625 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BLOCK_BLK_PM_H_ #define _BLOCK_BLK_PM_H_ #include <linux/pm_runtime.h> #ifdef CONFIG_PM static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { if (!q->dev || !blk_queue_pm_only(q)) return 1; /* Nothing to do */ if (pm && q->rpm_status != RPM_SUSPENDED) return 1; /* Request allowed */ pm_request_resume(q->dev); return 0; } static inline void blk_pm_mark_last_busy(struct request *rq) { if (rq->q->dev && !(rq->rq_flags & RQF_PM)) pm_runtime_mark_last_busy(rq->q->dev); } #else static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { return 1; } static inline void blk_pm_mark_last_busy(struct request *rq) { } #endif #endif /* _BLOCK_BLK_PM_H_ */
3179 3188 11 3180 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VIRTIO_RING_H #define _LINUX_VIRTIO_RING_H #include <asm/barrier.h> #include <linux/irqreturn.h> #include <uapi/linux/virtio_ring.h> /* * Barriers in virtio are tricky. Non-SMP virtio guests can't assume * they're not on an SMP host system, so they need to assume real * barriers. Non-SMP virtio hosts could skip the barriers, but does * anyone care? * * For virtio_pci on SMP, we don't need to order with respect to MMIO * accesses through relaxed memory I/O windows, so virt_mb() et al are * sufficient. * * For using virtio to talk to real devices (eg. other heterogeneous * CPUs) we do need real barriers. In theory, we could be using both * kinds of virtio, so it's a runtime decision, and the branch is * actually quite cheap. */ static inline void virtio_mb(bool weak_barriers) { if (weak_barriers) virt_mb(); else mb(); } static inline void virtio_rmb(bool weak_barriers) { if (weak_barriers) virt_rmb(); else dma_rmb(); } static inline void virtio_wmb(bool weak_barriers) { if (weak_barriers) virt_wmb(); else dma_wmb(); } #define virtio_store_mb(weak_barriers, p, v) \ do { \ if (weak_barriers) { \ virt_store_mb(*p, v); \ } else { \ WRITE_ONCE(*p, v); \ mb(); \ } \ } while (0) \ struct virtio_device; struct virtqueue; struct device; /* * Creates a virtqueue and allocates the descriptor ring. If * may_reduce_num is set, then this may allocate a smaller ring than * expected. The caller should query virtqueue_get_vring_size to learn * the actual size of the ring. */ struct virtqueue *vring_create_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Creates a virtqueue and allocates the descriptor ring with per * virtqueue DMA device. */ struct virtqueue *vring_create_virtqueue_dma(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool ctx, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name, struct device *dma_dev); /* * Creates a virtqueue with a standard layout but a caller-allocated * ring. */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool ctx, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name); /* * Destroys a virtqueue. If created with vring_create_virtqueue, this * also frees the ring. */ void vring_del_virtqueue(struct virtqueue *vq); /* Filter out transport-specific feature bits. */ void vring_transport_features(struct virtio_device *vdev); irqreturn_t vring_interrupt(int irq, void *_vq); u32 vring_notification_data(struct virtqueue *_vq); #endif /* _LINUX_VIRTIO_RING_H */
1272 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 // SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/sched/debug.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/init.h> #include <linux/export.h> #include <linux/pm.h> #include <linux/tick.h> #include <linux/random.h> #include <linux/user-return-notifier.h> #include <linux/dmi.h> #include <linux/utsname.h> #include <linux/stackprotector.h> #include <linux/cpuidle.h> #include <linux/acpi.h> #include <linux/elf-randomize.h> #include <linux/static_call.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <linux/entry-common.h> #include <asm/cpu.h> #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/api.h> #include <asm/fpu/sched.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/nmi.h> #include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/vm86.h> #include <asm/switch_to.h> #include <asm/desc.h> #include <asm/prctl.h> #include <asm/spec-ctrl.h> #include <asm/io_bitmap.h> #include <asm/proto.h> #include <asm/frame.h> #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> #include <asm/shstk.h> #include "process.h" /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned * so they are allowed to end up in the .data..cacheline_aligned * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { /* * .sp0 is only used when entering ring 0 from a lower * privilege level. Since the init task never runs anything * but ring 0 code, there is no need for a valid value here. * Poison it. */ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, #ifdef CONFIG_X86_32 .sp1 = TOP_OF_INIT_STACK, .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, #endif .io_bitmap_base = IO_BITMAP_OFFSET_INVALID, }, }; EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { memcpy(dst, src, arch_task_struct_size); #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif /* Drop the copied pointer to current's fpstate */ dst->thread.fpu.fpstate = NULL; return 0; } #ifdef CONFIG_X86_64 void arch_release_task_struct(struct task_struct *tsk) { if (fpu_state_size_dynamic()) fpstate_free(&tsk->thread.fpu); } #endif /* * Free thread data structures etc.. */ void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); free_vm86(t); shstk_free(tsk); fpu__drop(fpu); } static int set_new_tls(struct task_struct *p, unsigned long tls) { struct user_desc __user *utls = (struct user_desc __user *)tls; if (in_ia32_syscall()) return do_set_thread_area(p, -1, utls, 0); else return do_set_thread_area_64(p, ARCH_SET_FS, tls); } __visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, int (*fn)(void *), void *fn_arg) { schedule_tail(prev); /* Is this a kernel thread? */ if (unlikely(fn)) { fn(fn_arg); /* * A kernel thread is allowed to return here after successfully * calling kernel_execve(). Exit to userspace to complete the * execve() syscall. */ regs->ax = 0; } syscall_exit_to_user_mode(regs); } int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; struct pt_regs *childregs; unsigned long new_ssp; int ret = 0; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; frame->bp = encode_frame_pointer(childregs); frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); #ifdef CONFIG_X86_64 current_save_fsgs(); p->thread.fsindex = current->thread.fsindex; p->thread.fsbase = current->thread.fsbase; p->thread.gsindex = current->thread.gsindex; p->thread.gsbase = current->thread.gsbase; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); if (p->mm && (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM) set_bit(MM_CONTEXT_LOCK_LAM, &p->mm->context.flags); #else p->thread.sp0 = (unsigned long) (childregs + 1); savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain * flags. The flags consistency (especially vs. AC) is there * ensured via objtool, which lacks 32bit support. */ frame->flags = X86_EFLAGS_FIXED; #endif /* * Allocate a new shadow stack for thread if needed. If shadow stack, * is disabled, new_ssp will remain 0, and fpu_clone() will know not to * update it. */ new_ssp = shstk_alloc_thread_stack(p, clone_flags, args->stack_size); if (IS_ERR_VALUE(new_ssp)) return PTR_ERR((void *)new_ssp); fpu_clone(p, clone_flags, args->fn, new_ssp); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } /* * Clone current's PKRU value from hardware. tsk->thread.pkru * is only valid when scheduled out. */ p->thread.pkru = read_pkru(); frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; if (unlikely(args->fn)) { /* * A user space thread, but it doesn't return to * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. * * It does the same kernel frame setup to return to a kernel * function that a kernel thread does. */ childregs->sp = 0; childregs->ip = 0; kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } /* Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) ret = set_new_tls(p, tls); if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP))) io_bitmap_share(p); /* * If copy_thread() if failing, don't leak the shadow stack possibly * allocated in shstk_alloc_thread_stack() above. */ if (ret) shstk_free(p); return ret; } static void pkru_flush_thread(void) { /* * If PKRU is enabled the default PKRU value has to be loaded into * the hardware right here (similar to context switch). */ pkru_write_default(); } void flush_thread(void) { struct task_struct *tsk = current; flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); fpu_flush_thread(); pkru_flush_thread(); } void disable_TSC(void) { preempt_disable(); if (!test_and_set_thread_flag(TIF_NOTSC)) /* * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ cr4_set_bits(X86_CR4_TSD); preempt_enable(); } static void enable_TSC(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOTSC)) /* * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } int get_tsc_mode(unsigned long adr) { unsigned int val; if (test_thread_flag(TIF_NOTSC)) val = PR_TSC_SIGSEGV; else val = PR_TSC_ENABLE; return put_user(val, (unsigned int __user *)adr); } int set_tsc_mode(unsigned int val) { if (val == PR_TSC_SIGSEGV) disable_TSC(); else if (val == PR_TSC_ENABLE) enable_TSC(); else return -EINVAL; return 0; } DEFINE_PER_CPU(u64, msr_misc_features_shadow); static void set_cpuid_faulting(bool on) { u64 msrval; msrval = this_cpu_read(msr_misc_features_shadow); msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); this_cpu_write(msr_misc_features_shadow, msrval); wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); } static void disable_cpuid(void) { preempt_disable(); if (!test_and_set_thread_flag(TIF_NOCPUID)) { /* * Must flip the CPU state synchronously with * TIF_NOCPUID in the current running context. */ set_cpuid_faulting(true); } preempt_enable(); } static void enable_cpuid(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOCPUID)) { /* * Must flip the CPU state synchronously with * TIF_NOCPUID in the current running context. */ set_cpuid_faulting(false); } preempt_enable(); } static int get_cpuid_mode(void) { return !test_thread_flag(TIF_NOCPUID); } static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) enable_cpuid(); else disable_cpuid(); return 0; } /* * Called immediately after a successful exec. */ void arch_setup_new_exec(void) { /* If cpuid was previously disabled for this task, re-enable it. */ if (test_thread_flag(TIF_NOCPUID)) enable_cpuid(); /* * Don't inherit TIF_SSBD across exec boundary when * PR_SPEC_DISABLE_NOEXEC is used. */ if (test_thread_flag(TIF_SSBD) && task_spec_ssb_noexec(current)) { clear_thread_flag(TIF_SSBD); task_clear_spec_ssb_disable(current); task_clear_spec_ssb_noexec(current); speculation_ctrl_update(read_thread_flags()); } mm_reset_untag_mask(current->mm); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void switch_to_bitmap(unsigned long tifp) { /* * Invalidate I/O bitmap if the previous task used it. This prevents * any possible leakage of an active I/O bitmap. * * If the next task has an I/O bitmap it will handle it on exit to * user mode. */ if (tifp & _TIF_IO_BITMAP) tss_invalidate_io_bitmap(); } static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm) { /* * Copy at least the byte range of the incoming tasks bitmap which * covers the permitted I/O ports. * * If the previous task which used an I/O bitmap had more bits * permitted, then the copy needs to cover those as well so they * get turned off. */ memcpy(tss->io_bitmap.bitmap, iobm->bitmap, max(tss->io_bitmap.prev_max, iobm->max)); /* * Store the new max and the sequence number of this bitmap * and a pointer to the bitmap itself. */ tss->io_bitmap.prev_max = iobm->max; tss->io_bitmap.prev_sequence = iobm->sequence; } /** * native_tss_update_io_bitmap - Update I/O bitmap before exiting to user mode */ void native_tss_update_io_bitmap(void) { struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw); struct thread_struct *t = &current->thread; u16 *base = &tss->x86_tss.io_bitmap_base; if (!test_thread_flag(TIF_IO_BITMAP)) { native_tss_invalidate_io_bitmap(); return; } if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) { *base = IO_BITMAP_OFFSET_VALID_ALL; } else { struct io_bitmap *iobm = t->io_bitmap; /* * Only copy bitmap data when the sequence number differs. The * update time is accounted to the incoming task. */ if (tss->io_bitmap.prev_sequence != iobm->sequence) tss_copy_io_bitmap(tss, iobm); /* Enable the bitmap */ *base = IO_BITMAP_OFFSET_VALID_MAP; } /* * Make sure that the TSS limit is covering the IO bitmap. It might have * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O * access from user space to trigger a #GP because tbe bitmap is outside * the TSS limit. */ refresh_tss_limit(); } #else /* CONFIG_X86_IOPL_IOPERM */ static inline void switch_to_bitmap(unsigned long tifp) { } #endif #ifdef CONFIG_SMP struct ssb_state { struct ssb_state *shared_state; raw_spinlock_t lock; unsigned int disable_state; unsigned long local_state; }; #define LSTATE_SSB 0 static DEFINE_PER_CPU(struct ssb_state, ssb_state); void speculative_store_bypass_ht_init(void) { struct ssb_state *st = this_cpu_ptr(&ssb_state); unsigned int this_cpu = smp_processor_id(); unsigned int cpu; st->local_state = 0; /* * Shared state setup happens once on the first bringup * of the CPU. It's not destroyed on CPU hotunplug. */ if (st->shared_state) return; raw_spin_lock_init(&st->lock); /* * Go over HT siblings and check whether one of them has set up the * shared state pointer already. */ for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { if (cpu == this_cpu) continue; if (!per_cpu(ssb_state, cpu).shared_state) continue; /* Link it to the state of the sibling: */ st->shared_state = per_cpu(ssb_state, cpu).shared_state; return; } /* * First HT sibling to come up on the core. Link shared state of * the first HT sibling to itself. The siblings on the same core * which come up later will see the shared state pointer and link * themselves to the state of this CPU. */ st->shared_state = st; } /* * Logic is: First HT sibling enables SSBD for both siblings in the core * and last sibling to disable it, disables it for the whole core. This how * MSR_SPEC_CTRL works in "hardware": * * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL */ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { struct ssb_state *st = this_cpu_ptr(&ssb_state); u64 msr = x86_amd_ls_cfg_base; if (!static_cpu_has(X86_FEATURE_ZEN)) { msr |= ssbd_tif_to_amd_ls_cfg(tifn); wrmsrl(MSR_AMD64_LS_CFG, msr); return; } if (tifn & _TIF_SSBD) { /* * Since this can race with prctl(), block reentry on the * same CPU. */ if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) return; msr |= x86_amd_ls_cfg_ssbd_mask; raw_spin_lock(&st->shared_state->lock); /* First sibling enables SSBD: */ if (!st->shared_state->disable_state) wrmsrl(MSR_AMD64_LS_CFG, msr); st->shared_state->disable_state++; raw_spin_unlock(&st->shared_state->lock); } else { if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) return; raw_spin_lock(&st->shared_state->lock); st->shared_state->disable_state--; if (!st->shared_state->disable_state) wrmsrl(MSR_AMD64_LS_CFG, msr); raw_spin_unlock(&st->shared_state->lock); } } #else static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); wrmsrl(MSR_AMD64_LS_CFG, msr); } #endif static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) { /* * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, * so ssbd_tif_to_spec_ctrl() just works. */ wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } /* * Update the MSRs managing speculation control, during context switch. * * tifp: Previous task's thread flags * tifn: Next task's thread flags */ static __always_inline void __speculation_ctrl_update(unsigned long tifp, unsigned long tifn) { unsigned long tif_diff = tifp ^ tifn; u64 msr = x86_spec_ctrl_base; bool updmsr = false; lockdep_assert_irqs_disabled(); /* Handle change of TIF_SSBD depending on the mitigation method. */ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) { if (tif_diff & _TIF_SSBD) amd_set_ssb_virt_state(tifn); } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) { if (tif_diff & _TIF_SSBD) amd_set_core_ssb_state(tifn); } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || static_cpu_has(X86_FEATURE_AMD_SSBD)) { updmsr |= !!(tif_diff & _TIF_SSBD); msr |= ssbd_tif_to_spec_ctrl(tifn); } /* Only evaluate TIF_SPEC_IB if conditional STIBP is enabled. */ if (IS_ENABLED(CONFIG_SMP) && static_branch_unlikely(&switch_to_cond_stibp)) { updmsr |= !!(tif_diff & _TIF_SPEC_IB); msr |= stibp_tif_to_spec_ctrl(tifn); } if (updmsr) update_spec_ctrl_cond(msr); } static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) { if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) { if (task_spec_ssb_disable(tsk)) set_tsk_thread_flag(tsk, TIF_SSBD); else clear_tsk_thread_flag(tsk, TIF_SSBD); if (task_spec_ib_disable(tsk)) set_tsk_thread_flag(tsk, TIF_SPEC_IB); else clear_tsk_thread_flag(tsk, TIF_SPEC_IB); } /* Return the updated threadinfo flags*/ return read_task_thread_flags(tsk); } void speculation_ctrl_update(unsigned long tif) { unsigned long flags; /* Forced update. Make sure all relevant TIF flags are different */ local_irq_save(flags); __speculation_ctrl_update(~tif, tif); local_irq_restore(flags); } /* Called from seccomp/prctl update */ void speculation_ctrl_update_current(void) { preempt_disable(); speculation_ctrl_update(speculation_ctrl_update_tif(current)); preempt_enable(); } static inline void cr4_toggle_bits_irqsoff(unsigned long mask) { unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); newval = cr4 ^ mask; if (newval != cr4) { this_cpu_write(cpu_tlbstate.cr4, newval); __write_cr4(newval); } } void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; tifn = read_task_thread_flags(next_p); tifp = read_task_thread_flags(prev_p); switch_to_bitmap(tifp); propagate_user_return_notify(prev_p, next_p); if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && arch_has_block_step()) { unsigned long debugctl, msk; rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~DEBUGCTLMSR_BTF; msk = tifn & _TIF_BLOCKSTEP; debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); } if ((tifp ^ tifn) & _TIF_NOTSC) cr4_toggle_bits_irqsoff(X86_CR4_TSD); if ((tifp ^ tifn) & _TIF_NOCPUID) set_cpuid_faulting(!!(tifn & _TIF_NOCPUID)); if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) { __speculation_ctrl_update(tifp, tifn); } else { speculation_ctrl_update_tif(prev_p); tifn = speculation_ctrl_update_tif(next_p); /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } } /* * Idle related variables and functions */ unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE; EXPORT_SYMBOL(boot_option_idle_override); /* * We use this if we don't have any better idle routine.. */ void __cpuidle default_idle(void) { raw_safe_halt(); raw_local_irq_disable(); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); #endif DEFINE_STATIC_CALL_NULL(x86_idle, default_idle); static bool x86_idle_set(void) { return !!static_call_query(x86_idle); } #ifndef CONFIG_SMP static inline void __noreturn play_dead(void) { BUG(); } #endif void arch_cpu_idle_enter(void) { tsc_verify_tsc_adjust(false); local_touch_nmi(); } void __noreturn arch_cpu_idle_dead(void) { play_dead(); } /* * Called from the generic idle code. */ void __cpuidle arch_cpu_idle(void) { static_call(x86_idle)(); } EXPORT_SYMBOL_GPL(arch_cpu_idle); #ifdef CONFIG_XEN bool xen_set_default_idle(void) { bool ret = x86_idle_set(); static_call_update(x86_idle, default_idle); return ret; } #endif struct cpumask cpus_stop_mask; void __noreturn stop_this_cpu(void *dummy) { struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info); unsigned int cpu = smp_processor_id(); local_irq_disable(); /* * Remove this CPU from the online mask and disable it * unconditionally. This might be redundant in case that the reboot * vector was handled late and stop_other_cpus() sent an NMI. * * According to SDM and APM NMIs can be accepted even after soft * disabling the local APIC. */ set_cpu_online(cpu, false); disable_local_APIC(); mcheck_cpu_clear(c); /* * Use wbinvd on processors that support SME. This provides support * for performing a successful kexec when going from SME inactive * to SME active (or vice-versa). The cache must be cleared so that * if there are entries with the same physical address, both with and * without the encryption bit, they don't race each other when flushed * and potentially end up with the wrong entry being committed to * memory. * * Test the CPUID bit directly because the machine might've cleared * X86_FEATURE_SME due to cmdline options. */ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0))) native_wbinvd(); /* * This brings a cache line back and dirties it, but * native_stop_other_cpus() will overwrite cpus_stop_mask after it * observed that all CPUs reported stop. This write will invalidate * the related cache line on this CPU. */ cpumask_clear_cpu(cpu, &cpus_stop_mask); for (;;) { /* * Use native_halt() so that memory contents don't change * (stack usage and variables) after possibly issuing the * native_wbinvd() above. */ native_halt(); } } /* * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power * states (local apic timer and TSC stop). * * XXX this function is completely buggered vs RCU and tracing. */ static void amd_e400_idle(void) { /* * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E * gets set after static_cpu_has() places have been converted via * alternatives. */ if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { default_idle(); return; } tick_broadcast_enter(); default_idle(); tick_broadcast_exit(); } /* * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf * exists and whenever MONITOR/MWAIT extensions are present there is at * least one C1 substate. * * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait * is passed to kernel commandline parameter. */ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; /* User has disallowed the use of MWAIT. Fallback to HALT */ if (boot_option_idle_override == IDLE_NOMWAIT) return 0; /* MWAIT is not supported on this platform. Fallback to HALT */ if (!cpu_has(c, X86_FEATURE_MWAIT)) return 0; /* Monitor has a bug. Fallback to HALT */ if (boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); /* * If MWAIT extensions are not available, it is safe to use MWAIT * with EAX=0, ECX=0. */ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) return 1; /* * If MWAIT extensions are available, there should be at least one * MWAIT C1 substate present. */ return (edx & MWAIT_C1_SUBSTATE_MASK); } /* * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT * with interrupts enabled and no flags, which is backwards compatible with the * original MWAIT implementation. */ static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { mb(); /* quirk */ clflush((void *)&current_thread_info()->flags); mb(); /* quirk */ } __monitor((void *)&current_thread_info()->flags, 0, 0); if (!need_resched()) { __sti_mwait(0, 0); raw_local_irq_disable(); } } __current_clr_polling(); } void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1) pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); #endif if (x86_idle_set() || boot_option_idle_override == IDLE_POLL) return; if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { pr_info("using AMD E400 aware idle routine\n"); static_call_update(x86_idle, amd_e400_idle); } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); static_call_update(x86_idle, mwait_idle); } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { pr_info("using TDX aware idle routine\n"); static_call_update(x86_idle, tdx_safe_halt); } else static_call_update(x86_idle, default_idle); } void amd_e400_c1e_apic_setup(void) { if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); local_irq_disable(); tick_broadcast_force(); local_irq_enable(); } } void __init arch_post_acpi_subsys_init(void) { u32 lo, hi; if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) return; /* * AMD E400 detection needs to happen after ACPI has been enabled. If * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in * MSR_K8_INT_PENDING_MSG. */ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) return; boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); pr_info("System has AMD C1E enabled\n"); } static int __init idle_setup(char *str) { if (!str) return -EINVAL; if (!strcmp(str, "poll")) { pr_info("using polling idle threads\n"); boot_option_idle_override = IDLE_POLL; cpu_idle_poll_ctrl(true); } else if (!strcmp(str, "halt")) { /* * When the boot option of idle=halt is added, halt is * forced to be used for CPU idle. In such case CPU C2/C3 * won't be used again. * To continue to load the CPU idle driver, don't touch * the boot_option_idle_override. */ static_call_update(x86_idle, default_idle); boot_option_idle_override = IDLE_HALT; } else if (!strcmp(str, "nomwait")) { /* * If the boot option of "idle=nomwait" is added, * it means that mwait will be disabled for CPU C1/C2/C3 * states. */ boot_option_idle_override = IDLE_NOMWAIT; } else return -1; return 0; } early_param("idle", idle_setup); unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_u32_below(8192); return sp & ~0xf; } unsigned long arch_randomize_brk(struct mm_struct *mm) { return randomize_page(mm->brk, 0x02000000); } /* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack * changing under us. */ unsigned long __get_wchan(struct task_struct *p) { struct unwind_state state; unsigned long addr = 0; if (!try_get_task_stack(p)) return 0; for (unwind_start(&state, p, NULL, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr) break; if (in_sched_functions(addr)) continue; break; } put_task_stack(p); return addr; } long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: return fpu_xstate_prctl(option, arg2); } return -EINVAL; }
412 411 412 412 412 402 402 149 2 147 147 4 147 146 146 2 111 111 110 2744 2712 425 21 425 400 400 400 398 397 147 146 397 396 4 4 397 138 397 397 389 123 142 144 143 143 142 1698 137 137 111 113 113 17 111 53 51 67 137 467 467 465 466 439 466 1763 1762 500 466 467 467 467 467 467 466 466 465 1185 87 87 47 47 47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 // SPDX-License-Identifier: GPL-2.0-only /* * mm/truncate.c - code for taking down pages from address_spaces * * Copyright (C) 2002, Linus Torvalds * * 10Sep2002 Andrew Morton * Initial version. */ #include <linux/kernel.h> #include <linux/backing-dev.h> #include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/export.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/pagevec.h> #include <linux/task_io_accounting_ops.h> #include <linux/shmem_fs.h> #include <linux/rmap.h> #include "internal.h" /* * Regular page slots are stabilized by the page lock even without the tree * itself locked. These unlocked entries need verification under the tree * lock. */ static inline void __clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { XA_STATE(xas, &mapping->i_pages, index); xas_set_update(&xas, workingset_update_node); if (xas_load(&xas) != entry) return; xas_store(&xas, NULL); } static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } /* * Unconditionally remove exceptional entries. Usually called from truncate * path. Note that the folio_batch may be altered by this function by removing * exceptional entries similar to what folio_batch_remove_exceptionals() does. */ static void truncate_folio_batch_exceptionals(struct address_space *mapping, struct folio_batch *fbatch, pgoff_t *indices) { int i, j; bool dax; /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; for (j = 0; j < folio_batch_count(fbatch); j++) if (xa_is_value(fbatch->folios[j])) break; if (j == folio_batch_count(fbatch)) return; dax = dax_mapping(mapping); if (!dax) { spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); } for (i = j; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; pgoff_t index = indices[i]; if (!xa_is_value(folio)) { fbatch->folios[j++] = folio; continue; } if (unlikely(dax)) { dax_delete_mapping_entry(mapping, index); continue; } __clear_shadow_entry(mapping, index, folio); } if (!dax) { xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); } fbatch->nr = j; } /* * Invalidate exceptional entry if easily possible. This handles exceptional * entries for invalidate_inode_pages(). */ static int invalidate_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { /* Handled by shmem itself, or for DAX we do nothing. */ if (shmem_mapping(mapping) || dax_mapping(mapping)) return 1; clear_shadow_entry(mapping, index, entry); return 1; } /* * Invalidate exceptional entry if clean. This handles exceptional entries for * invalidate_inode_pages2() so for DAX it evicts only clean entries. */ static int invalidate_exceptional_entry2(struct address_space *mapping, pgoff_t index, void *entry) { /* Handled by shmem itself */ if (shmem_mapping(mapping)) return 1; if (dax_mapping(mapping)) return dax_invalidate_mapping_entry_sync(mapping, index); clear_shadow_entry(mapping, index, entry); return 1; } /** * folio_invalidate - Invalidate part or all of a folio. * @folio: The folio which is affected. * @offset: start of the range to invalidate * @length: length of the range to invalidate * * folio_invalidate() is called when all or part of the folio has become * invalidated by a truncate operation. * * folio_invalidate() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */ void folio_invalidate(struct folio *folio, size_t offset, size_t length) { const struct address_space_operations *aops = folio->mapping->a_ops; if (aops->invalidate_folio) aops->invalidate_folio(folio, offset, length); } EXPORT_SYMBOL_GPL(folio_invalidate); /* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). * * We need to bail out if page->mapping is no longer equal to the original * mapping. This happens a) when the VM reclaimed the page while we waited on * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ static void truncate_cleanup_folio(struct folio *folio) { if (folio_mapped(folio)) unmap_mapping_folio(folio); if (folio_has_private(folio)) folio_invalidate(folio, 0, folio_size(folio)); /* * Some filesystems seem to re-dirty the page even after * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ folio_cancel_dirty(folio); folio_clear_mappedtodisk(folio); } int truncate_inode_folio(struct address_space *mapping, struct folio *folio) { if (folio->mapping != mapping) return -EIO; truncate_cleanup_folio(folio); filemap_remove_folio(folio); return 0; } /* * Handle partial folios. The folio may be entirely within the * range if a split has raced with us. If not, we zero the part of the * folio that's within the [start, end] range, and then split the folio if * it's large. split_page_range() will discard pages which now lie beyond * i_size, and we rely on the caller to discard pages which lie within a * newly created hole. * * Returns false if splitting failed so the caller can avoid * discarding the entire folio which is stubbornly unsplit. */ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) { loff_t pos = folio_pos(folio); unsigned int offset, length; if (pos < start) offset = start - pos; else offset = 0; length = folio_size(folio); if (pos + length <= (u64)end) length = length - offset; else length = end + 1 - pos - offset; folio_wait_writeback(folio); if (length == folio_size(folio)) { truncate_inode_folio(folio->mapping, folio); return true; } /* * We may be zeroing pages we're about to discard, but it avoids * doing a complex calculation here, and then doing the zeroing * anyway if the page split fails. */ folio_zero_range(folio, offset, length); if (folio_has_private(folio)) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; if (split_folio(folio) == 0) return true; if (folio_test_dirty(folio)) return false; truncate_inode_folio(folio->mapping, folio); return true; } /* * Used to get rid of pages on hardware memory corruption. */ int generic_error_remove_page(struct address_space *mapping, struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); if (!mapping) return -EINVAL; /* * Only punch for normal data pages for now. * Handling other types like directories would need more auditing. */ if (!S_ISREG(mapping->host->i_mode)) return -EIO; return truncate_inode_folio(mapping, page_folio(page)); } EXPORT_SYMBOL(generic_error_remove_page); static long mapping_evict_folio(struct address_space *mapping, struct folio *folio) { if (folio_test_dirty(folio) || folio_test_writeback(folio)) return 0; /* The refcount will be elevated if any page in the folio is mapped */ if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) return 0; if (!filemap_release_folio(folio, 0)) return 0; return remove_mapping(mapping, folio); } /** * invalidate_inode_page() - Remove an unused page from the pagecache. * @page: The page to remove. * * Safely invalidate one page from its pagecache mapping. * It only drops clean, unused pages. * * Context: Page must be locked. * Return: The number of pages successfully removed. */ long invalidate_inode_page(struct page *page) { struct folio *folio = page_folio(page); struct address_space *mapping = folio_mapping(folio); /* The page may have been truncated before it was locked */ if (!mapping) return 0; return mapping_evict_folio(mapping, folio); } /** * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate * @lstart: offset from which to truncate * @lend: offset to which to truncate (inclusive) * * Truncate the page cache, removing the pages that are between * specified offsets (and zeroing out partial pages * if lstart or lend + 1 is not page aligned). * * Truncate takes two passes - the first pass is nonblocking. It will not * block on page locks and it will not block on writeback. The second pass * will wait. This is to prevent as much IO as possible in the affected region. * The first pass will remove most pages, so the search cost of the second pass * is low. * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. * * Note that since ->invalidate_folio() accepts range to invalidate * truncate_inode_pages_range is able to handle cases where lend + 1 is not * page aligned properly. */ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { pgoff_t start; /* inclusive */ pgoff_t end; /* exclusive */ struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; struct folio *folio; bool same_folio; if (mapping_empty(mapping)) return; /* * 'start' and 'end' always covers the range of pages to be fully * truncated. Partial pages are covered with 'partial_start' at the * start of the range and 'partial_end' at the end of the range. * Note that 'end' is exclusive while 'lend' is inclusive. */ start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; if (lend == -1) /* * lend == -1 indicates end-of-file so we have to set 'end' * to the highest possible pgoff_t and since the type is * unsigned we're using -1. */ end = -1; else end = (lend + 1) >> PAGE_SHIFT; folio_batch_init(&fbatch); index = start; while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { truncate_folio_batch_exceptionals(mapping, &fbatch, indices); for (i = 0; i < folio_batch_count(&fbatch); i++) truncate_cleanup_folio(fbatch.folios[i]); delete_from_page_cache_batch(mapping, &fbatch); for (i = 0; i < folio_batch_count(&fbatch); i++) folio_unlock(fbatch.folios[i]); folio_batch_release(&fbatch); cond_resched(); } same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { same_folio = lend < folio_pos(folio) + folio_size(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { start = folio_next_index(folio); if (same_folio) end = folio->index; } folio_unlock(folio); folio_put(folio); folio = NULL; } if (!same_folio) { folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT, FGP_LOCK, 0); if (!IS_ERR(folio)) { if (!truncate_inode_partial_folio(folio, lstart, lend)) end = folio->index; folio_unlock(folio); folio_put(folio); } } index = start; while (index < end) { cond_resched(); if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) break; /* Otherwise restart to make sure all gone */ index = start; continue; } for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing page->index */ if (xa_is_value(folio)) continue; folio_lock(folio); VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); truncate_inode_folio(mapping, folio); folio_unlock(folio); } truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); } } EXPORT_SYMBOL(truncate_inode_pages_range); /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_rwsem and * mapping->invalidate_lock. * * Note: When this function returns, there can be a page in the process of * deletion (inside __filemap_remove_folio()) in the specified range. Thus * mapping->nrpages can be non-zero when this function returns even after * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { truncate_inode_pages_range(mapping, lstart, (loff_t)-1); } EXPORT_SYMBOL(truncate_inode_pages); /** * truncate_inode_pages_final - truncate *all* pages before inode dies * @mapping: mapping to truncate * * Called under (and serialized by) inode->i_rwsem. * * Filesystems have to use this in the .evict_inode path to inform the * VM that this is the final truncate and the inode is going away. */ void truncate_inode_pages_final(struct address_space *mapping) { /* * Page reclaim can not participate in regular inode lifetime * management (can't call iput()) and thus can race with the * inode teardown. Tell it when the address space is exiting, * so that it does not install eviction information after the * final truncate has begun. */ mapping_set_exiting(mapping); if (!mapping_empty(mapping)) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree * modification that does not see AS_EXITING is * completed before starting the final truncate. */ xa_lock_irq(&mapping->i_pages); xa_unlock_irq(&mapping->i_pages); } truncate_inode_pages(mapping, 0); } EXPORT_SYMBOL(truncate_inode_pages_final); /** * mapping_try_invalidate - Invalidate all the evictable folios of one inode * @mapping: the address_space which holds the folios to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * @nr_failed: How many folio invalidations failed * * This function is similar to invalidate_mapping_pages(), except that it * returns the number of folios which could not be evicted in @nr_failed. */ unsigned long mapping_try_invalidate(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_failed) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; folio_batch_init(&fbatch); while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { count += invalidate_exceptional_entry(mapping, indices[i], folio); continue; } ret = mapping_evict_folio(mapping, folio); folio_unlock(folio); /* * Invalidation is a hint that the folio is no longer * of interest and try to speed up its reclaim. */ if (!ret) { deactivate_file_folio(folio); /* Likely in the lru cache of a remote CPU */ if (nr_failed) (*nr_failed)++; } count += ret; } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } return count; } /** * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode * @mapping: the address_space which holds the cache to invalidate * @start: the offset 'from' which to invalidate * @end: the offset 'to' which to invalidate (inclusive) * * This function removes pages that are clean, unmapped and unlocked, * as well as shadow entries. It will not block on IO activity. * * If you want to remove all the pages of one inode, regardless of * their use and writeback state, use truncate_inode_pages(). * * Return: The number of indices that had their contents invalidated */ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { return mapping_try_invalidate(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages); /* * This is like invalidate_inode_page(), except it ignores the page's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because * shrink_page_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ static int invalidate_complete_folio2(struct address_space *mapping, struct folio *folio) { if (folio->mapping != mapping) return 0; if (!filemap_release_folio(folio, GFP_KERNEL)) return 0; spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); if (folio_test_dirty(folio)) goto failed; BUG_ON(folio_has_private(folio)); __filemap_remove_folio(folio, NULL); xa_unlock_irq(&mapping->i_pages); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); filemap_free_folio(mapping, folio); return 1; failed: xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); return 0; } static int folio_launder(struct address_space *mapping, struct folio *folio) { if (!folio_test_dirty(folio)) return 0; if (folio->mapping != mapping || mapping->a_ops->launder_folio == NULL) return 0; return mapping->a_ops->launder_folio(folio); } /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space * @start: the page offset 'from' which to invalidate * @end: the page offset 'to' which to invalidate (inclusive) * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { pgoff_t indices[PAGEVEC_SIZE]; struct folio_batch fbatch; pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; if (mapping_empty(mapping)) return 0; folio_batch_init(&fbatch); index = start; while (find_get_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, indices[i], folio)) ret = -EBUSY; continue; } if (!did_range_unmap && folio_mapped(folio)) { /* * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ unmap_mapping_pages(mapping, indices[i], (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); if (unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); if (folio_mapped(folio)) unmap_mapping_folio(folio); BUG_ON(folio_mapped(folio)); ret2 = folio_launder(mapping, folio); if (ret2 == 0) { if (!invalidate_complete_folio2(mapping, folio)) ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); } /* * For DAX we invalidate page tables after invalidating page cache. We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { unmap_mapping_pages(mapping, start, end - start + 1, false); } return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); /** * invalidate_inode_pages2 - remove all pages from an address_space * @mapping: the address_space * * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * * Return: -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache(struct inode *inode, loff_t newsize) { struct address_space *mapping = inode->i_mapping; loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for * efficiency so that truncate_inode_pages does fewer * single-page unmaps. However after this first call, and * before truncate_inode_pages finishes, it is possible for * private pages to be COWed, which remain after * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ unmap_mapping_range(mapping, holebegin, 0, 1); truncate_inode_pages(mapping, newsize); unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); /** * truncate_setsize - update inode and pagecache for a new file size * @inode: inode * @newsize: new file size * * truncate_setsize updates i_size and performs pagecache truncation (if * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * * Must be called with a lock serializing truncates and writes (generally * i_rwsem but e.g. xfs uses a different lock) and before all filesystem * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { loff_t oldsize = inode->i_size; i_size_write(inode, newsize); if (newsize > oldsize) pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); /** * pagecache_isize_extended - update pagecache after extension of i_size * @inode: inode for which i_size was extended * @from: original inode size * @to: new inode size * * Handle extension of inode size either caused by extending truncate or by * write starting after current i_size. We mark the page straddling current * i_size RO so that page_mkwrite() is called on the nearest write access to * the page. This way filesystem can be sure that page_mkwrite() is called on * the page before user writes to the page via mmap after the i_size has been * changed. * * The function must be called after i_size is updated so that page fault * coming after we unlock the page will already see the new i_size. * The function must be called while we still hold i_rwsem - this not only * makes sure i_size is stable but also that userspace cannot observe new * i_size value before we are prepared to store mmap writes at new inode size. */ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) { int bsize = i_blocksize(inode); loff_t rounded_from; struct page *page; pgoff_t index; WARN_ON(to > inode->i_size); if (from >= to || bsize == PAGE_SIZE) return; /* Page straddling @from will not have any hole block created? */ rounded_from = round_up(from, bsize); if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1))) return; index = from >> PAGE_SHIFT; page = find_lock_page(inode->i_mapping, index); /* Page not cached? Nothing to do */ if (!page) return; /* * See clear_page_dirty_for_io() for details why set_page_dirty() * is needed. */ if (page_mkclean(page)) set_page_dirty(page); unlock_page(page); put_page(page); } EXPORT_SYMBOL(pagecache_isize_extended); /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode * @lstart: offset of beginning of hole * @lend: offset of last byte of hole * * This function should typically be called before the filesystem * releases resources associated with the freed range (eg. deallocates * blocks). This way, pagecache will always stay logically coherent * with on-disk format, and the filesystem would not have to deal with * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; loff_t unmap_start = round_up(lstart, PAGE_SIZE); loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; /* * This rounding is currently just for example: unmap_mapping_range * expands its hole outwards, whereas we want it to contract the hole * inwards. However, existing callers of truncate_pagecache_range are * doing their own page rounding first. Note that unmap_mapping_range * allows holelen 0 for all, and we allow lend -1 for end of file. */ /* * Unlike in truncate_pagecache, unmap_mapping_range is called only * once (before truncating pagecache), and without "even_cows" flag: * hole-punching should not remove private COWed pages from the hole. */ if ((u64)unmap_end > (u64)unmap_start) unmap_mapping_range(mapping, unmap_start, 1 + unmap_end - unmap_start, 0); truncate_inode_pages_range(mapping, lstart, lend); } EXPORT_SYMBOL(truncate_pagecache_range);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vsyscall #if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define __VSYSCALL_TRACE_H #include <linux/tracepoint.h> TRACE_EVENT(emulate_vsyscall, TP_PROTO(int nr), TP_ARGS(nr), TP_STRUCT__entry(__field(int, nr)), TP_fast_assign( __entry->nr = nr; ), TP_printk("nr = %d", __entry->nr) ); #endif #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ #define TRACE_INCLUDE_FILE vsyscall_trace #include <trace/define_trace.h>
1764 1764 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2007-2012 Siemens AG * * Written by: * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> * Alexander Smirnov <alex.bluesman.smirnov@gmail.com> */ #include <linux/netdevice.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/ieee802154.h> #include <net/nl802154.h> #include <net/mac802154.h> #include <net/ieee802154_netdev.h> #include <net/cfg802154.h> #include "ieee802154_i.h" #include "driver-ops.h" int mac802154_wpan_update_llsec(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; int rc = 0; if (ops->llsec) { struct ieee802154_llsec_params params; int changed = 0; params.pan_id = wpan_dev->pan_id; changed |= IEEE802154_LLSEC_PARAM_PAN_ID; params.hwaddr = wpan_dev->extended_addr; changed |= IEEE802154_LLSEC_PARAM_HWADDR; rc = ops->llsec->set_params(dev, &params, changed); } return rc; } static int mac802154_wpan_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct sockaddr_ieee802154 *sa = (struct sockaddr_ieee802154 *)&ifr->ifr_addr; int err = -ENOIOCTLCMD; if (cmd != SIOCGIFADDR && cmd != SIOCSIFADDR) return err; rtnl_lock(); switch (cmd) { case SIOCGIFADDR: { u16 pan_id, short_addr; pan_id = le16_to_cpu(wpan_dev->pan_id); short_addr = le16_to_cpu(wpan_dev->short_addr); if (pan_id == IEEE802154_PANID_BROADCAST || short_addr == IEEE802154_ADDR_BROADCAST) { err = -EADDRNOTAVAIL; break; } sa->family = AF_IEEE802154; sa->addr.addr_type = IEEE802154_ADDR_SHORT; sa->addr.pan_id = pan_id; sa->addr.short_addr = short_addr; err = 0; break; } case SIOCSIFADDR: if (netif_running(dev)) { rtnl_unlock(); return -EBUSY; } dev_warn(&dev->dev, "Using DEBUGing ioctl SIOCSIFADDR isn't recommended!\n"); if (sa->family != AF_IEEE802154 || sa->addr.addr_type != IEEE802154_ADDR_SHORT || sa->addr.pan_id == IEEE802154_PANID_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_BROADCAST || sa->addr.short_addr == IEEE802154_ADDR_UNDEF) { err = -EINVAL; break; } wpan_dev->pan_id = cpu_to_le16(sa->addr.pan_id); wpan_dev->short_addr = cpu_to_le16(sa->addr.short_addr); err = mac802154_wpan_update_llsec(dev); break; } rtnl_unlock(); return err; } static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct sockaddr *addr = p; __le64 extended_addr; if (netif_running(dev)) return -EBUSY; /* lowpan need to be down for update * SLAAC address after ifup */ if (sdata->wpan_dev.lowpan_dev) { if (netif_running(sdata->wpan_dev.lowpan_dev)) return -EBUSY; } ieee802154_be64_to_le64(&extended_addr, addr->sa_data); if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; dev_addr_set(dev, addr->sa_data); sdata->wpan_dev.extended_addr = extended_addr; /* update lowpan interface mac address when * wpan mac has been changed */ if (sdata->wpan_dev.lowpan_dev) dev_addr_set(sdata->wpan_dev.lowpan_dev, dev->dev_addr); return mac802154_wpan_update_llsec(dev); } static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) { struct ieee802154_local *local = sdata->local; struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; sdata->required_filtering = sdata->iface_default_filtering; if (local->hw.flags & IEEE802154_HW_AFILT) { local->addr_filt.pan_id = wpan_dev->pan_id; local->addr_filt.ieee_addr = wpan_dev->extended_addr; local->addr_filt.short_addr = wpan_dev->short_addr; } if (local->hw.flags & IEEE802154_HW_LBT) { ret = drv_set_lbt_mode(local, wpan_dev->lbt); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { ret = drv_set_csma_params(local, wpan_dev->min_be, wpan_dev->max_be, wpan_dev->csma_retries); if (ret < 0) return ret; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { ret = drv_set_max_frame_retries(local, wpan_dev->frame_retries); if (ret < 0) return ret; } return 0; } static int mac802154_slave_open(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; int res; ASSERT_RTNL(); set_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) { res = ieee802154_setup_hw(sdata); if (res) goto err; res = drv_start(local, sdata->required_filtering, &local->addr_filt); if (res) goto err; } local->open_count++; netif_start_queue(dev); return 0; err: /* might already be clear but that doesn't matter */ clear_bit(SDATA_STATE_RUNNING, &sdata->state); return res; } static int ieee802154_check_mac_settings(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct ieee802154_sub_if_data *nsdata) { struct wpan_dev *nwpan_dev = &nsdata->wpan_dev; struct wpan_dev *wpan_dev = &sdata->wpan_dev; ASSERT_RTNL(); if (sdata->iface_default_filtering != nsdata->iface_default_filtering) return -EBUSY; if (local->hw.flags & IEEE802154_HW_AFILT) { if (wpan_dev->pan_id != nwpan_dev->pan_id || wpan_dev->short_addr != nwpan_dev->short_addr || wpan_dev->extended_addr != nwpan_dev->extended_addr) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_CSMA_PARAMS) { if (wpan_dev->min_be != nwpan_dev->min_be || wpan_dev->max_be != nwpan_dev->max_be || wpan_dev->csma_retries != nwpan_dev->csma_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_FRAME_RETRIES) { if (wpan_dev->frame_retries != nwpan_dev->frame_retries) return -EBUSY; } if (local->hw.flags & IEEE802154_HW_LBT) { if (wpan_dev->lbt != nwpan_dev->lbt) return -EBUSY; } return 0; } static int ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype iftype) { struct ieee802154_local *local = sdata->local; struct ieee802154_sub_if_data *nsdata; /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { if (nsdata != sdata && ieee802154_sdata_running(nsdata)) { int ret; /* TODO currently we don't support multiple node/coord * types we need to run skb_clone at rx path. Check if * there exist really an use case if we need to support * multiple node/coord types at the same time. */ if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR && nsdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) return -EBUSY; /* check all phy mac sublayer settings are the same. * We have only one phy, different values makes trouble. */ ret = ieee802154_check_mac_settings(local, sdata, nsdata); if (ret < 0) return ret; } } return 0; } static int mac802154_wpan_open(struct net_device *dev) { int rc; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; rc = ieee802154_check_concurrent_iface(sdata, wpan_dev->iftype); if (rc < 0) return rc; return mac802154_slave_open(dev); } static int mac802154_slave_close(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_local *local = sdata->local; ASSERT_RTNL(); if (mac802154_is_scanning(local)) mac802154_abort_scan_locked(local, sdata); if (mac802154_is_beaconing(local)) mac802154_stop_beacons_locked(local, sdata); netif_stop_queue(dev); local->open_count--; clear_bit(SDATA_STATE_RUNNING, &sdata->state); if (!local->open_count) ieee802154_stop_device(local); return 0; } static int mac802154_set_header_security(struct ieee802154_sub_if_data *sdata, struct ieee802154_hdr *hdr, const struct ieee802154_mac_cb *cb) { struct ieee802154_llsec_params params; u8 level; mac802154_llsec_get_params(&sdata->sec, &params); if (!params.enabled && cb->secen_override && cb->secen) return -EINVAL; if (!params.enabled || (cb->secen_override && !cb->secen) || !params.out_level) return 0; if (cb->seclevel_override && !cb->seclevel) return -EINVAL; level = cb->seclevel_override ? cb->seclevel : params.out_level; hdr->fc.security_enabled = 1; hdr->sec.level = level; hdr->sec.key_id_mode = params.out_key.mode; if (params.out_key.mode == IEEE802154_SCF_KEY_SHORT_INDEX) hdr->sec.short_src = params.out_key.short_source; else if (params.out_key.mode == IEEE802154_SCF_KEY_HW_INDEX) hdr->sec.extended_src = params.out_key.extended_source; hdr->sec.key_id = params.out_key.id; return 0; } static int ieee802154_header_create(struct sk_buff *skb, struct net_device *dev, const struct ieee802154_addr *daddr, const struct ieee802154_addr *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb *cb = mac_cb(skb); int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = cb->type; hdr.fc.security_enabled = cb->secen; hdr.fc.ack_request = cb->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; if (mac802154_set_header_security(sdata, &hdr, cb) < 0) return -EINVAL; if (!saddr) { if (wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST) || wpan_dev->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { hdr.source.mode = IEEE802154_ADDR_LONG; hdr.source.extended_addr = wpan_dev->extended_addr; } else { hdr.source.mode = IEEE802154_ADDR_SHORT; hdr.source.short_addr = wpan_dev->short_addr; } hdr.source.pan_id = wpan_dev->pan_id; } else { hdr.source = *(const struct ieee802154_addr *)saddr; } hdr.dest = *(const struct ieee802154_addr *)daddr; hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static const struct wpan_dev_header_ops ieee802154_header_ops = { .create = ieee802154_header_create, }; /* This header create functionality assumes a 8 byte array for * source and destination pointer at maximum. To adapt this for * the 802.15.4 dataframe header we use extended address handling * here only and intra pan connection. fc fields are mostly fallback * handling. For provide dev_hard_header for dgram sockets. */ static int mac802154_header_create(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len) { struct ieee802154_hdr hdr; struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_mac_cb cb = { }; int hlen; if (!daddr) return -EINVAL; memset(&hdr.fc, 0, sizeof(hdr.fc)); hdr.fc.type = IEEE802154_FC_TYPE_DATA; hdr.fc.ack_request = wpan_dev->ackreq; hdr.seq = atomic_inc_return(&dev->ieee802154_ptr->dsn) & 0xFF; /* TODO currently a workaround to give zero cb block to set * security parameters defaults according MIB. */ if (mac802154_set_header_security(sdata, &hdr, &cb) < 0) return -EINVAL; hdr.dest.pan_id = wpan_dev->pan_id; hdr.dest.mode = IEEE802154_ADDR_LONG; ieee802154_be64_to_le64(&hdr.dest.extended_addr, daddr); hdr.source.pan_id = hdr.dest.pan_id; hdr.source.mode = IEEE802154_ADDR_LONG; if (!saddr) hdr.source.extended_addr = wpan_dev->extended_addr; else ieee802154_be64_to_le64(&hdr.source.extended_addr, saddr); hlen = ieee802154_hdr_push(skb, &hdr); if (hlen < 0) return -EINVAL; skb_reset_mac_header(skb); skb->mac_len = hlen; if (len > ieee802154_max_payload(&hdr)) return -EMSGSIZE; return hlen; } static int mac802154_header_parse(const struct sk_buff *skb, unsigned char *haddr) { struct ieee802154_hdr hdr; if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) { pr_debug("malformed packet\n"); return 0; } if (hdr.source.mode == IEEE802154_ADDR_LONG) { ieee802154_le64_to_be64(haddr, &hdr.source.extended_addr); return IEEE802154_EXTENDED_ADDR_LEN; } return 0; } static const struct header_ops mac802154_header_ops = { .create = mac802154_header_create, .parse = mac802154_header_parse, }; static const struct net_device_ops mac802154_wpan_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_subif_start_xmit, .ndo_do_ioctl = mac802154_wpan_ioctl, .ndo_set_mac_address = mac802154_wpan_mac_addr, }; static const struct net_device_ops mac802154_monitor_ops = { .ndo_open = mac802154_wpan_open, .ndo_stop = mac802154_slave_close, .ndo_start_xmit = ieee802154_monitor_start_xmit, }; static void mac802154_wpan_free(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); mac802154_llsec_destroy(&sdata->sec); } static void ieee802154_if_setup(struct net_device *dev) { dev->addr_len = IEEE802154_EXTENDED_ADDR_LEN; memset(dev->broadcast, 0xff, IEEE802154_EXTENDED_ADDR_LEN); /* Let hard_header_len set to IEEE802154_MIN_HEADER_LEN. AF_PACKET * will not send frames without any payload, but ack frames * has no payload, so substract one that we can send a 3 bytes * frame. The xmit callback assumes at least a hard header where two * bytes fc and sequence field are set. */ dev->hard_header_len = IEEE802154_MIN_HEADER_LEN - 1; /* The auth_tag header is for security and places in private payload * room of mac frame which stucks between payload and FCS field. */ dev->needed_tailroom = IEEE802154_MAX_AUTH_TAG_LEN + IEEE802154_FCS_LEN; /* The mtu size is the payload without mac header in this case. * We have a dynamic length header with a minimum header length * which is hard_header_len. In this case we let mtu to the size * of maximum payload which is IEEE802154_MTU - IEEE802154_FCS_LEN - * hard_header_len. The FCS which is set by hardware or ndo_start_xmit * and the minimum mac header which can be evaluated inside driver * layer. The rest of mac header will be part of payload if greater * than hard_header_len. */ dev->mtu = IEEE802154_MTU - IEEE802154_FCS_LEN - dev->hard_header_len; dev->tx_queue_len = 300; dev->flags = IFF_NOARP | IFF_BROADCAST; } static int ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; u8 tmp; /* set some type-dependent values */ sdata->wpan_dev.iftype = type; get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->bsn, tmp); get_random_bytes(&tmp, sizeof(tmp)); atomic_set(&wpan_dev->dsn, tmp); /* defaults per 802.15.4-2011 */ wpan_dev->min_be = 3; wpan_dev->max_be = 5; wpan_dev->csma_retries = 4; wpan_dev->frame_retries = 3; wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ieee802154_be64_to_le64(&wpan_dev->extended_addr, sdata->dev->dev_addr); sdata->dev->header_ops = &mac802154_header_ops; sdata->dev->needs_free_netdev = true; sdata->dev->priv_destructor = mac802154_wpan_free; sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); ret = mac802154_wpan_update_llsec(sdata->dev); if (ret < 0) return ret; break; case NL802154_IFTYPE_MONITOR: sdata->dev->needs_free_netdev = true; sdata->dev->netdev_ops = &mac802154_monitor_ops; sdata->iface_default_filtering = IEEE802154_FILTERING_NONE; break; default: BUG(); } return 0; } struct net_device * ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { u8 addr[IEEE802154_EXTENDED_ADDR_LEN]; struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; int ret; ASSERT_RTNL(); ndev = alloc_netdev(sizeof(*sdata), name, name_assign_type, ieee802154_if_setup); if (!ndev) return ERR_PTR(-ENOMEM); ndev->needed_headroom = local->hw.extra_tx_headroom + IEEE802154_MAX_HEADER_LEN; ret = dev_alloc_name(ndev, ndev->name); if (ret < 0) goto err; ieee802154_le64_to_be64(ndev->perm_addr, &local->hw.phy->perm_extended_addr); switch (type) { case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) { ieee802154_le64_to_be64(addr, &extended_addr); dev_addr_set(ndev, addr); } else { dev_addr_set(ndev, ndev->perm_addr); } break; case NL802154_IFTYPE_MONITOR: ndev->type = ARPHRD_IEEE802154_MONITOR; break; default: ret = -EINVAL; goto err; } /* TODO check this */ SET_NETDEV_DEV(ndev, &local->phy->dev); dev_net_set(ndev, wpan_phy_net(local->hw.phy)); sdata = netdev_priv(ndev); ndev->ieee802154_ptr = &sdata->wpan_dev; memcpy(sdata->name, ndev->name, IFNAMSIZ); sdata->dev = ndev; sdata->wpan_dev.wpan_phy = local->hw.phy; sdata->local = local; INIT_LIST_HEAD(&sdata->wpan_dev.list); /* setup type-dependent data */ ret = ieee802154_setup_sdata(sdata, type); if (ret) goto err; ret = register_netdevice(ndev); if (ret < 0) goto err; mutex_lock(&local->iflist_mtx); list_add_tail_rcu(&sdata->list, &local->interfaces); mutex_unlock(&local->iflist_mtx); return ndev; err: free_netdev(ndev); return ERR_PTR(ret); } void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) { ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); synchronize_rcu(); unregister_netdevice(sdata->dev); } void ieee802154_remove_interfaces(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata, *tmp; mutex_lock(&local->iflist_mtx); list_for_each_entry_safe(sdata, tmp, &local->interfaces, list) { list_del(&sdata->list); unregister_netdevice(sdata->dev); } mutex_unlock(&local->iflist_mtx); } static int netdev_notify(struct notifier_block *nb, unsigned long state, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct ieee802154_sub_if_data *sdata; if (state != NETDEV_CHANGENAME) return NOTIFY_DONE; if (!dev->ieee802154_ptr || !dev->ieee802154_ptr->wpan_phy) return NOTIFY_DONE; if (dev->ieee802154_ptr->wpan_phy->privid != mac802154_wpan_phy_privid) return NOTIFY_DONE; sdata = IEEE802154_DEV_TO_SUB_IF(dev); memcpy(sdata->name, dev->name, IFNAMSIZ); return NOTIFY_OK; } static struct notifier_block mac802154_netdev_notifier = { .notifier_call = netdev_notify, }; int ieee802154_iface_init(void) { return register_netdevice_notifier(&mac802154_netdev_notifier); } void ieee802154_iface_exit(void) { unregister_netdevice_notifier(&mac802154_netdev_notifier); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KDEV_T_H #define _LINUX_KDEV_T_H #include <uapi/linux/kdev_t.h> #define MINORBITS 20 #define MINORMASK ((1U << MINORBITS) - 1) #define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) #define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) #define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) #define print_dev_t(buffer, dev) \ sprintf((buffer), "%u:%u\n", MAJOR(dev), MINOR(dev)) #define format_dev_t(buffer, dev) \ ({ \ sprintf(buffer, "%u:%u", MAJOR(dev), MINOR(dev)); \ buffer; \ }) /* acceptable for old filesystems */ static __always_inline bool old_valid_dev(dev_t dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; } static __always_inline u16 old_encode_dev(dev_t dev) { return (MAJOR(dev) << 8) | MINOR(dev); } static __always_inline dev_t old_decode_dev(u16 val) { return MKDEV((val >> 8) & 255, val & 255); } static __always_inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); unsigned minor = MINOR(dev); return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12); } static __always_inline dev_t new_decode_dev(u32 dev) { unsigned major = (dev & 0xfff00) >> 8; unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00); return MKDEV(major, minor); } static __always_inline u64 huge_encode_dev(dev_t dev) { return new_encode_dev(dev); } static __always_inline dev_t huge_decode_dev(u64 dev) { return new_decode_dev(dev); } static __always_inline int sysv_valid_dev(dev_t dev) { return MAJOR(dev) < (1<<14) && MINOR(dev) < (1<<18); } static __always_inline u32 sysv_encode_dev(dev_t dev) { return MINOR(dev) | (MAJOR(dev) << 18); } static __always_inline unsigned sysv_major(u32 dev) { return (dev >> 18) & 0x3fff; } static __always_inline unsigned sysv_minor(u32 dev) { return dev & 0x3ffff; } #endif
4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2019 Facebook */ #include <linux/hash.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/rbtree_latch.h> #include <linux/perf_event.h> #include <linux/btf.h> #include <linux/rcupdate_trace.h> #include <linux/rcupdate_wait.h> #include <linux/static_call.h> #include <linux/bpf_verifier.h> #include <linux/bpf_lsm.h> #include <linux/delay.h> /* dummy _ops. The verifier will operate on target program's ops. */ const struct bpf_verifier_ops bpf_extension_verifier_ops = { }; const struct bpf_prog_ops bpf_extension_prog_ops = { }; /* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */ #define TRAMPOLINE_HASH_BITS 10 #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) { struct bpf_trampoline *tr = ops->private; int ret = 0; if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { /* This is called inside register_ftrace_direct_multi(), so * tr->mutex is already locked. */ lockdep_assert_held_once(&tr->mutex); /* Instead of updating the trampoline here, we propagate * -EAGAIN to register_ftrace_direct(). Then we can * retry register_ftrace_direct() after updating the * trampoline. */ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) { if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY)) return -EBUSY; tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; return -EAGAIN; } return 0; } /* The normal locking order is * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) * * The following two commands are called from * * prepare_direct_functions_for_ipmodify * cleanup_direct_functions_after_ipmodify * * In both cases, direct_mutex is already locked. Use * mutex_trylock(&tr->mutex) to avoid deadlock in race condition * (something else is making changes to this same trampoline). */ if (!mutex_trylock(&tr->mutex)) { /* sleep 1 ms to make sure whatever holding tr->mutex makes * some progress. */ msleep(1); return -EAGAIN; } switch (cmd) { case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER: tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); break; case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; if (tr->flags & BPF_TRAMP_F_ORIG_STACK) ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */); break; default: ret = -EINVAL; break; } mutex_unlock(&tr->mutex); return ret; } #endif bool bpf_prog_has_trampoline(const struct bpf_prog *prog) { enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type ptype = prog->type; return (ptype == BPF_PROG_TYPE_TRACING && (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || eatype == BPF_MODIFY_RETURN)) || (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); } void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym) { ksym->start = (unsigned long) data; ksym->end = ksym->start + PAGE_SIZE; bpf_ksym_add(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, false, ksym->name); } void bpf_image_ksym_del(struct bpf_ksym *ksym) { bpf_ksym_del(ksym); perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start, PAGE_SIZE, true, ksym->name); } static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; int i; mutex_lock(&trampoline_mutex); head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)]; hlist_for_each_entry(tr, head, hlist) { if (tr->key == key) { refcount_inc(&tr->refcnt); goto out; } } tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) goto out; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL); if (!tr->fops) { kfree(tr); tr = NULL; goto out; } tr->fops->private = tr; tr->fops->ops_func = bpf_tramp_ftrace_ops_func; #endif tr->key = key; INIT_HLIST_NODE(&tr->hlist); hlist_add_head(&tr->hlist, head); refcount_set(&tr->refcnt, 1); mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); out: mutex_unlock(&trampoline_mutex); return tr; } static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false); else ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL); return ret; } static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, bool lock_direct_mutex) { void *ip = tr->func.addr; int ret; if (tr->func.ftrace_managed) { if (lock_direct_mutex) ret = modify_ftrace_direct(tr->fops, (long)new_addr); else ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr); } return ret; } /* first time registering */ static int register_fentry(struct bpf_trampoline *tr, void *new_addr) { void *ip = tr->func.addr; unsigned long faddr; int ret; faddr = ftrace_location((unsigned long)ip); if (faddr) { if (!tr->fops) return -ENOTSUPP; tr->func.ftrace_managed = true; } if (tr->func.ftrace_managed) { ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1); ret = register_ftrace_direct(tr->fops, (long)new_addr); } else { ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr); } return ret; } static struct bpf_tramp_links * bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) { struct bpf_tramp_link *link; struct bpf_tramp_links *tlinks; struct bpf_tramp_link **links; int kind; *total = 0; tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) return ERR_PTR(-ENOMEM); for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { tlinks[kind].nr_links = tr->progs_cnt[kind]; *total += tr->progs_cnt[kind]; links = tlinks[kind].links; hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { *ip_arg |= link->link.prog->call_get_func_ip; *links++ = link; } } return tlinks; } static void bpf_tramp_image_free(struct bpf_tramp_image *im) { bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } static void __bpf_tramp_image_put_deferred(struct work_struct *work) { struct bpf_tramp_image *im; im = container_of(work, struct bpf_tramp_image, work); bpf_tramp_image_free(im); } /* callback, fexit step 3 or fentry step 2 */ static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) { struct bpf_tramp_image *im; im = container_of(rcu, struct bpf_tramp_image, rcu); INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); schedule_work(&im->work); } /* callback, fexit step 2. Called after percpu_ref_kill confirms. */ static void __bpf_tramp_image_release(struct percpu_ref *pcref) { struct bpf_tramp_image *im; im = container_of(pcref, struct bpf_tramp_image, pcref); call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); } /* callback, fexit or fentry step 1 */ static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) { struct bpf_tramp_image *im; im = container_of(rcu, struct bpf_tramp_image, rcu); if (im->ip_after_call) /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */ percpu_ref_kill(&im->pcref); else /* the case of fentry trampoline */ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); } static void bpf_tramp_image_put(struct bpf_tramp_image *im) { /* The trampoline image that calls original function is using: * rcu_read_lock_trace to protect sleepable bpf progs * rcu_read_lock to protect normal bpf progs * percpu_ref to protect trampoline itself * rcu tasks to protect trampoline asm not covered by percpu_ref * (which are few asm insns before __bpf_tramp_enter and * after __bpf_tramp_exit) * * The trampoline is unreachable before bpf_tramp_image_put(). * * First, patch the trampoline to avoid calling into fexit progs. * The progs will be freed even if the original function is still * executing or sleeping. * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on * first few asm instructions to execute and call into * __bpf_tramp_enter->percpu_ref_get. * Then use percpu_ref_kill to wait for the trampoline and the original * function to finish. * Then use call_rcu_tasks() to make sure few asm insns in * the trampoline epilogue are done as well. * * In !PREEMPT case the task that got interrupted in the first asm * insns won't go through an RCU quiescent state which the * percpu_ref_kill will be waiting for. Hence the first * call_rcu_tasks() is not necessary. */ if (im->ip_after_call) { int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, NULL, im->ip_epilogue); WARN_ON(err); if (IS_ENABLED(CONFIG_PREEMPTION)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); else percpu_ref_kill(&im->pcref); return; } /* The trampoline without fexit and fmod_ret progs doesn't call original * function and doesn't use percpu_ref. * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. * Then use call_rcu_tasks() to wait for the rest of trampoline asm * and normal progs. */ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); } static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key) { struct bpf_tramp_image *im; struct bpf_ksym *ksym; void *image; int err = -ENOMEM; im = kzalloc(sizeof(*im), GFP_KERNEL); if (!im) goto out; err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; err = -ENOMEM; im->image = image = bpf_jit_alloc_exec(PAGE_SIZE); if (!image) goto out_uncharge; set_vm_flush_reset_perms(image); err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); if (err) goto out_free_image; ksym = &im->ksym; INIT_LIST_HEAD_RCU(&ksym->lnode); snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key); bpf_image_ksym_add(image, ksym); return im; out_free_image: bpf_jit_free_exec(im->image); out_uncharge: bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: return ERR_PTR(err); } static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) { struct bpf_tramp_image *im; struct bpf_tramp_links *tlinks; u32 orig_flags = tr->flags; bool ip_arg = false; int err, total; tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg); if (IS_ERR(tlinks)) return PTR_ERR(tlinks); if (total == 0) { err = unregister_fentry(tr, tr->cur_image->image); bpf_tramp_image_put(tr->cur_image); tr->cur_image = NULL; goto out; } im = bpf_tramp_image_alloc(tr->key); if (IS_ERR(im)) { err = PTR_ERR(im); goto out; } /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); if (tlinks[BPF_TRAMP_FEXIT].nr_links || tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME * should not be set together. */ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; } else { tr->flags |= BPF_TRAMP_F_RESTORE_REGS; } if (ip_arg) tr->flags |= BPF_TRAMP_F_IP_ARG; #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS again: if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && (tr->flags & BPF_TRAMP_F_CALL_ORIG)) tr->flags |= BPF_TRAMP_F_ORIG_STACK; #endif err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, &tr->func.model, tr->flags, tlinks, tr->func.addr); if (err < 0) goto out_free; set_memory_rox((long)im->image, 1); WARN_ON(tr->cur_image && total == 0); if (tr->cur_image) /* progs already running at this address */ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex); else /* first time registering */ err = register_fentry(tr, im->image); #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if (err == -EAGAIN) { /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. */ /* reset fops->func and fops->trampoline for re-register */ tr->fops->func = NULL; tr->fops->trampoline = 0; /* reset im->image memory attr for arch_prepare_bpf_trampoline */ set_memory_nx((long)im->image, 1); set_memory_rw((long)im->image, 1); goto again; } #endif if (err) goto out_free; if (tr->cur_image) bpf_tramp_image_put(tr->cur_image); tr->cur_image = im; out: /* If any error happens, restore previous flags */ if (err) tr->flags = orig_flags; kfree(tlinks); return err; out_free: bpf_tramp_image_free(im); goto out; } static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) { switch (prog->expected_attach_type) { case BPF_TRACE_FENTRY: return BPF_TRAMP_FENTRY; case BPF_MODIFY_RETURN: return BPF_TRAMP_MODIFY_RETURN; case BPF_TRACE_FEXIT: return BPF_TRAMP_FEXIT; case BPF_LSM_MAC: if (!prog->aux->attach_func_proto->type) /* The function returns void, we cannot modify its * return value. */ return BPF_TRAMP_FEXIT; else return BPF_TRAMP_MODIFY_RETURN; default: return BPF_TRAMP_REPLACE; } } static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; struct bpf_tramp_link *link_exiting; int err = 0; int cnt = 0, i; kind = bpf_attach_type_to_tramp(link->link.prog); if (tr->extension_prog) /* cannot attach fentry/fexit if extension prog is attached. * cannot overwrite extension prog either. */ return -EBUSY; for (i = 0; i < BPF_TRAMP_MAX; i++) cnt += tr->progs_cnt[i]; if (kind == BPF_TRAMP_REPLACE) { /* Cannot attach extension if fentry/fexit are in use. */ if (cnt) return -EBUSY; tr->extension_prog = link->link.prog; return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, link->link.prog->bpf_func); } if (cnt >= BPF_MAX_TRAMP_LINKS) return -E2BIG; if (!hlist_unhashed(&link->tramp_hlist)) /* prog already linked */ return -EBUSY; hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { if (link_exiting->link.prog != link->link.prog) continue; /* prog already linked */ return -EBUSY; } hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]); tr->progs_cnt[kind]++; err = bpf_trampoline_update(tr, true /* lock_direct_mutex */); if (err) { hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; } return err; } int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { int err; mutex_lock(&tr->mutex); err = __bpf_trampoline_link_prog(link, tr); mutex_unlock(&tr->mutex); return err; } static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { enum bpf_tramp_prog_type kind; int err; kind = bpf_attach_type_to_tramp(link->link.prog); if (kind == BPF_TRAMP_REPLACE) { WARN_ON_ONCE(!tr->extension_prog); err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, tr->extension_prog->bpf_func, NULL); tr->extension_prog = NULL; return err; } hlist_del_init(&link->tramp_hlist); tr->progs_cnt[kind]--; return bpf_trampoline_update(tr, true /* lock_direct_mutex */); } /* bpf_trampoline_unlink_prog() should never fail. */ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) { int err; mutex_lock(&tr->mutex); err = __bpf_trampoline_unlink_prog(link, tr); mutex_unlock(&tr->mutex); return err; } #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) static void bpf_shim_tramp_link_release(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = container_of(link, struct bpf_shim_tramp_link, link.link); /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ if (!shim_link->trampoline) return; WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); bpf_trampoline_put(shim_link->trampoline); } static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) { struct bpf_shim_tramp_link *shim_link = container_of(link, struct bpf_shim_tramp_link, link.link); kfree(shim_link); } static const struct bpf_link_ops bpf_shim_tramp_link_lops = { .release = bpf_shim_tramp_link_release, .dealloc = bpf_shim_tramp_link_dealloc, }; static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, bpf_func_t bpf_func, int cgroup_atype) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_prog *p; shim_link = kzalloc(sizeof(*shim_link), GFP_USER); if (!shim_link) return NULL; p = bpf_prog_alloc(1, 0); if (!p) { kfree(shim_link); return NULL; } p->jited = false; p->bpf_func = bpf_func; p->aux->cgroup_atype = cgroup_atype; p->aux->attach_func_proto = prog->aux->attach_func_proto; p->aux->attach_btf_id = prog->aux->attach_btf_id; p->aux->attach_btf = prog->aux->attach_btf; btf_get(p->aux->attach_btf); p->type = BPF_PROG_TYPE_LSM; p->expected_attach_type = BPF_LSM_MAC; bpf_prog_inc(p); bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC, &bpf_shim_tramp_link_lops, p); bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype); return shim_link; } static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, bpf_func_t bpf_func) { struct bpf_tramp_link *link; int kind; for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { struct bpf_prog *p = link->link.prog; if (p->bpf_func == bpf_func) return container_of(link, struct bpf_shim_tramp_link, link); } } return NULL; } int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, int cgroup_atype) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_attach_target_info tgt_info = {}; struct bpf_trampoline *tr; bpf_func_t bpf_func; u64 key; int err; err = bpf_check_attach_target(NULL, prog, NULL, prog->aux->attach_btf_id, &tgt_info); if (err) return err; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; mutex_lock(&tr->mutex); shim_link = cgroup_shim_find(tr, bpf_func); if (shim_link) { /* Reusing existing shim attached by the other program. */ bpf_link_inc(&shim_link->link.link); mutex_unlock(&tr->mutex); bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return 0; } /* Allocate and install new shim. */ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); if (!shim_link) { err = -ENOMEM; goto err; } err = __bpf_trampoline_link_prog(&shim_link->link, tr); if (err) goto err; shim_link->trampoline = tr; /* note, we're still holding tr refcnt from above */ mutex_unlock(&tr->mutex); return 0; err: mutex_unlock(&tr->mutex); if (shim_link) bpf_link_put(&shim_link->link.link); /* have to release tr while _not_ holding its mutex */ bpf_trampoline_put(tr); /* bpf_trampoline_get above */ return err; } void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) { struct bpf_shim_tramp_link *shim_link = NULL; struct bpf_trampoline *tr; bpf_func_t bpf_func; u64 key; key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf, prog->aux->attach_btf_id); bpf_lsm_find_cgroup_shim(prog, &bpf_func); tr = bpf_trampoline_lookup(key); if (WARN_ON_ONCE(!tr)) return; mutex_lock(&tr->mutex); shim_link = cgroup_shim_find(tr, bpf_func); mutex_unlock(&tr->mutex); if (shim_link) bpf_link_put(&shim_link->link.link); bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ } #endif struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info) { struct bpf_trampoline *tr; tr = bpf_trampoline_lookup(key); if (!tr) return NULL; mutex_lock(&tr->mutex); if (tr->func.addr) goto out; memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); tr->func.addr = (void *)tgt_info->tgt_addr; out: mutex_unlock(&tr->mutex); return tr; } void bpf_trampoline_put(struct bpf_trampoline *tr) { int i; if (!tr) return; mutex_lock(&trampoline_mutex); if (!refcount_dec_and_test(&tr->refcnt)) goto out; WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); for (i = 0; i < BPF_TRAMP_MAX; i++) if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) goto out; /* This code will be executed even when the last bpf_tramp_image * is alive. All progs are detached from the trampoline and the * trampoline image is patched with jmp into epilogue to skip * fexit progs. The fentry-only trampoline will be freed via * multiple rcu callbacks. */ hlist_del(&tr->hlist); if (tr->fops) { ftrace_free_filter(tr->fops); kfree(tr->fops); } kfree(tr); out: mutex_unlock(&trampoline_mutex); } #define NO_START_TIME 1 static __always_inline u64 notrace bpf_prog_start_time(void) { u64 start = NO_START_TIME; if (static_branch_unlikely(&bpf_stats_enabled_key)) { start = sched_clock(); if (unlikely(!start)) start = NO_START_TIME; } return start; } /* The logic is similar to bpf_prog_run(), but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. The macro is split into * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit * * __bpf_prog_enter returns: * 0 - skip execution of the bpf prog * 1 - execute bpf prog * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); return 0; } return bpf_prog_start_time(); } static void notrace update_prog_stats(struct bpf_prog *prog, u64 start) { struct bpf_prog_stats *stats; if (static_branch_unlikely(&bpf_stats_enabled_key) && /* static_key could be enabled in __bpf_prog_enter* * and disabled in __bpf_prog_exit*. * And vice versa. * Hence check that 'start' is valid. */ start > NO_START_TIME) { unsigned long flags; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } } static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { /* Runtime stats are exported via actual BPF_LSM_CGROUP * programs, not the shims. */ rcu_read_lock(); migrate_disable(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return NO_START_TIME; } static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); migrate_enable(); rcu_read_unlock(); } u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { bpf_prog_inc_misses_counter(prog); return 0; } return bpf_prog_start_time(); } void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); might_fault(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return bpf_prog_start_time(); } static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock_trace(); } static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); migrate_disable(); run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); return bpf_prog_start_time(); } static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock(); } void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) { percpu_ref_get(&tr->pcref); } void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) { percpu_ref_put(&tr->pcref); } bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) { bool sleepable = prog->aux->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_enter_sleepable_recur : __bpf_prog_enter_recur; if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && prog->expected_attach_type == BPF_LSM_CGROUP) return __bpf_prog_enter_lsm_cgroup; return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter; } bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) { bool sleepable = prog->aux->sleepable; if (bpf_prog_check_recur(prog)) return sleepable ? __bpf_prog_exit_sleepable_recur : __bpf_prog_exit_recur; if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && prog->expected_attach_type == BPF_LSM_CGROUP) return __bpf_prog_exit_lsm_cgroup; return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit; } int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *orig_call) { return -ENOTSUPP; } static int __init init_trampolines(void) { int i; for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&trampoline_table[i]); return 0; } late_initcall(init_trampolines);
6 224 6441 7721 7690 7690 374 375 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _MM_PERCPU_INTERNAL_H #define _MM_PERCPU_INTERNAL_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/memcontrol.h> /* * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. * * The scan hint is the largest known contiguous area before the contig hint. * It is not necessarily the actual largest contig hint though. There is an * invariant that the scan_hint_start > contig_hint_start iff * scan_hint == contig_hint. This is necessary because when scanning forward, * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { int scan_hint; /* scan hint for block */ int scan_hint_start; /* block relative starting position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ int left_free; /* size of free space along the left side of the block */ int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ int nr_bits; /* total bits responsible for */ }; struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ size_t max_alloc_size; /* largest allocation size */ #endif struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ struct pcpu_block_md chunk_md; unsigned long *bound_map; /* boundary map */ /* * base_addr is the base address of this chunk. * To reduce false sharing, current layout is optimized to make sure * base_addr locate in the different cacheline with free_bytes and * chunk_md. */ void *base_addr ____cacheline_aligned_in_smp; unsigned long *alloc_map; /* allocation map */ struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ bool isolated; /* isolated from active chunk slots */ int start_offset; /* the overlap with the previous region to have a page aligned base_addr */ int end_offset; /* additional area required to have the region end page aligned */ #ifdef CONFIG_MEMCG_KMEM struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ int nr_populated; /* # of populated pages */ int nr_empty_pop_pages; /* # of empty populated pages */ unsigned long populated[]; /* populated bitmap */ }; extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; extern int pcpu_nr_slots; extern int pcpu_sidelined_slot; extern int pcpu_to_depopulate_slot; extern int pcpu_nr_empty_pop_pages; extern struct pcpu_chunk *pcpu_first_chunk; extern struct pcpu_chunk *pcpu_reserved_chunk; /** * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bitmap blocks used. */ static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk) { return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE; } /** * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap * @pages: number of physical pages * * This conversion is from physical pages to the number of bits * required in the bitmap. */ static inline int pcpu_nr_pages_to_map_bits(int pages) { return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; } /** * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap * @chunk: chunk of interest * * This conversion is from the number of physical pages that the chunk * serves to the number of bits in the bitmap. */ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) { return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } /** * pcpu_obj_full_size - helper to calculate size of each accounted object * @size: size of area to allocate in bytes * * For each accounted object there is an extra space which is used to store * obj_cgroup membership if kmemcg is not disabled. Charge it too. */ static inline size_t pcpu_obj_full_size(size_t size) { size_t extra_size = 0; #ifdef CONFIG_MEMCG_KMEM if (!mem_cgroup_kmem_disabled()) extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); #endif return size * num_possible_cpus() + extra_size; } #ifdef CONFIG_PERCPU_STATS #include <linux/spinlock.h> struct percpu_stats { u64 nr_alloc; /* lifetime # of allocations */ u64 nr_dealloc; /* lifetime # of deallocations */ u64 nr_cur_alloc; /* current # of allocations */ u64 nr_max_alloc; /* max # of live allocations */ u32 nr_chunks; /* current # of live chunks */ u32 nr_max_chunks; /* max # of live chunks */ size_t min_alloc_size; /* min allocation size */ size_t max_alloc_size; /* max allocation size */ }; extern struct percpu_stats pcpu_stats; extern struct pcpu_alloc_info pcpu_stats_ai; /* * For debug purposes. We don't care about the flexible array. */ static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info)); /* initialize min_alloc_size to unit_size */ pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size; } /* * pcpu_stats_area_alloc - increment area allocation stats * @chunk: the location of the area being allocated * @size: size of area to allocate in bytes * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_alloc++; pcpu_stats.nr_cur_alloc++; pcpu_stats.nr_max_alloc = max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc); pcpu_stats.min_alloc_size = min(pcpu_stats.min_alloc_size, size); pcpu_stats.max_alloc_size = max(pcpu_stats.max_alloc_size, size); chunk->nr_alloc++; chunk->max_alloc_size = max(chunk->max_alloc_size, size); } /* * pcpu_stats_area_dealloc - decrement allocation stats * @chunk: the location of the area being deallocated * * CONTEXT: * pcpu_lock. */ static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { lockdep_assert_held(&pcpu_lock); pcpu_stats.nr_dealloc++; pcpu_stats.nr_cur_alloc--; chunk->nr_alloc--; } /* * pcpu_stats_chunk_alloc - increment chunk stats */ static inline void pcpu_stats_chunk_alloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks++; pcpu_stats.nr_max_chunks = max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks); spin_unlock_irqrestore(&pcpu_lock, flags); } /* * pcpu_stats_chunk_dealloc - decrement chunk stats */ static inline void pcpu_stats_chunk_dealloc(void) { unsigned long flags; spin_lock_irqsave(&pcpu_lock, flags); pcpu_stats.nr_chunks--; spin_unlock_irqrestore(&pcpu_lock, flags); } #else static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai) { } static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size) { } static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk) { } static inline void pcpu_stats_chunk_alloc(void) { } static inline void pcpu_stats_chunk_dealloc(void) { } #endif /* !CONFIG_PERCPU_STATS */ #endif
14 14 11129 11036 581 1272 1272 92 92 92 91 92 12 12 11042 1048 14 5222 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 // SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * Per construction; when: * * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and * begin_new_exec()). */ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { __get_task_comm(buf, buf_size, tsk); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); p->vfork_done = &kthread->exited; p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. */ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; do_exit(0); } /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. */ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. */ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). * * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); int kthreadd(void *unused) { struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) schedule(); try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker(int cpu, unsigned int flags, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); if (cpu >= 0) node = cpu_to_node(cpu); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; if (cpu >= 0) kthread_bind(task, cpu); worker->flags = flags; worker->task = task; wake_up_process(task); return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker - create a kthread worker * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker(unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(-1, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the kthread worker (task). * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker(cpu, flags, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_worker_create(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. */ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = from_timer(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * del_timer_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); del_timer_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. */ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. */ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
1271 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H #ifdef CONFIG_HUGETLB_PAGE #include <linux/mm.h> static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return !!(vma->vm_flags & VM_HUGETLB); } #else static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { return false; } #endif #endif
479 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner */ #include "bat_v.h" #include "main.h" #include <linux/atomic.h> #include <linux/cache.h> #include <linux/errno.h> #include <linux/if_ether.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/limits.h> #include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "bat_v_elp.h" #include "bat_v_ogm.h" #include "gateway_client.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if) { batadv_v_elp_iface_activate(primary_if, hard_iface); batadv_hardif_put(primary_if); } /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can * set the interface as ACTIVE right away, without any risk of race * condition */ if (hard_iface->if_status == BATADV_IF_TO_BE_ACTIVATED) hard_iface->if_status = BATADV_IF_ACTIVE; } static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface) { int ret; ret = batadv_v_elp_iface_enable(hard_iface); if (ret < 0) return ret; ret = batadv_v_ogm_iface_enable(hard_iface); if (ret < 0) batadv_v_elp_iface_disable(hard_iface); return ret; } static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) { batadv_v_ogm_iface_disable(hard_iface); batadv_v_elp_iface_disable(hard_iface); } static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) { batadv_v_elp_primary_iface_set(hard_iface); batadv_v_ogm_primary_iface_set(hard_iface); } /** * batadv_v_iface_update_mac() - react to hard-interface MAC address change * @hard_iface: the modified interface * * If the modified interface is the primary one, update the originator * address in the ELP and OGM messages to reflect the new MAC address. */ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if != hard_iface) goto out; batadv_v_primary_iface_set(hard_iface); out: batadv_hardif_put(primary_if); } static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); INIT_WORK(&hardif_neigh->bat_v.metric_work, batadv_v_elp_throughput_metric_update); } /** * batadv_v_neigh_dump_neigh() - Dump a neighbour into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @hardif_neigh: Neighbour to dump * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_hardif_neigh_node *hardif_neigh) { void *hdr; unsigned int last_seen_msecs; u32 throughput; last_seen_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen); throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput); throughput = throughput * 100; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_NEIGHBORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, hardif_neigh->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, hardif_neigh->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, hardif_neigh->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_neigh_dump_hardif() - Dump the neighbours of a hard interface into * a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @hard_iface: The hard interface to be dumped * @idx_s: Entries to be skipped * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface, int *idx_s) { struct batadv_hardif_neigh_node *hardif_neigh; int idx = 0; hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) { if (idx++ < *idx_s) continue; if (batadv_v_neigh_dump_neigh(msg, portid, seq, hardif_neigh)) { *idx_s = idx - 1; return -EMSGSIZE; } } *idx_s = 0; return 0; } /** * batadv_v_neigh_dump() - Dump the neighbours of a hard interface into a * message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @single_hardif: Limit dumping to this hard interface */ static void batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *single_hardif) { struct batadv_hard_iface *hard_iface; int i_hardif = 0; int i_hardif_s = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; rcu_read_lock(); if (single_hardif) { if (i_hardif_s == 0) { if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, single_hardif, &idx) == 0) i_hardif++; } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (i_hardif++ < i_hardif_s) continue; if (batadv_v_neigh_dump_hardif(msg, portid, cb->nlh->nlmsg_seq, bat_priv, hard_iface, &idx)) { i_hardif--; break; } } } rcu_read_unlock(); cb->args[0] = i_hardif; cb->args[1] = idx; } /** * batadv_v_orig_dump_subentry() - Dump an originator subentry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour * @best: Is the best originator * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node, bool best) { struct batadv_neigh_ifinfo *n_ifinfo; unsigned int last_seen_msecs; u32 throughput; void *hdr; n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); if (!n_ifinfo) return 0; throughput = n_ifinfo->bat_v.throughput * 100; batadv_neigh_ifinfo_put(n_ifinfo); last_seen_msecs = jiffies_to_msecs(jiffies - orig_node->last_seen); if (if_outgoing != BATADV_IF_DEFAULT && if_outgoing != neigh_node->if_incoming) return 0; hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_ORIGINATORS); if (!hdr) return -ENOBUFS; if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) || nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN, neigh_node->addr) || nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, neigh_node->if_incoming->net_dev->name) || nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, neigh_node->if_incoming->net_dev->ifindex) || nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, last_seen_msecs)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** * batadv_v_orig_dump_entry() - Dump an originator entry into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip * * This function assumes the caller holds rcu_read_lock(). * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct batadv_orig_node *orig_node, int *sub_s) { struct batadv_neigh_node *neigh_node_best; struct batadv_neigh_node *neigh_node; int sub = 0; bool best; neigh_node_best = batadv_orig_router_get(orig_node, if_outgoing); if (!neigh_node_best) goto out; hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) { if (sub++ < *sub_s) continue; best = (neigh_node == neigh_node_best); if (batadv_v_orig_dump_subentry(msg, portid, seq, bat_priv, if_outgoing, orig_node, neigh_node, best)) { batadv_neigh_node_put(neigh_node_best); *sub_s = sub - 1; return -EMSGSIZE; } } out: batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; } /** * batadv_v_orig_dump_bucket() - Dump an originator bucket into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped * @sub: Number of sub entries to be skipped * * Return: Error code, or 0 on success */ static int batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing, struct hlist_head *head, int *idx_s, int *sub) { struct batadv_orig_node *orig_node; int idx = 0; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (idx++ < *idx_s) continue; if (batadv_v_orig_dump_entry(msg, portid, seq, bat_priv, if_outgoing, orig_node, sub)) { rcu_read_unlock(); *idx_s = idx - 1; return -EMSGSIZE; } } rcu_read_unlock(); *idx_s = 0; *sub = 0; return 0; } /** * batadv_v_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void batadv_v_orig_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; while (bucket < hash->size) { head = &hash->table[bucket]; if (batadv_v_orig_dump_bucket(msg, portid, cb->nlh->nlmsg_seq, bat_priv, if_outgoing, head, &idx, &sub)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; cb->args[2] = sub; } static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; int ret = 0; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing1, struct batadv_neigh_node *neigh2, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; u32 threshold; bool ret = false; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); if (!ifinfo1) goto err_ifinfo1; ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); if (!ifinfo2) goto err_ifinfo2; threshold = ifinfo1->bat_v.throughput / 4; threshold = ifinfo1->bat_v.throughput - threshold; ret = ifinfo2->bat_v.throughput > threshold; batadv_neigh_ifinfo_put(ifinfo2); err_ifinfo2: batadv_neigh_ifinfo_put(ifinfo1); err_ifinfo1: return ret; } /** * batadv_v_init_sel_class() - initialize GW selection class * @bat_priv: the bat priv with all the soft interface information */ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv) { /* set default throughput difference threshold to 5Mbps */ atomic_set(&bat_priv->gw.sel_class, 50); } /** * batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW * @gw_node: the GW to retrieve the metric for * @bw: the pointer where the metric will be stored. The metric is computed as * the minimum between the GW advertised throughput and the path throughput to * it in the mesh * * Return: 0 on success, -1 on failure */ static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_orig_node *orig_node; struct batadv_neigh_node *router; int ret = -1; orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; /* the GW metric is computed as the minimum between the path throughput * to reach the GW itself and the advertised bandwidth. * This gives us an approximation of the effective throughput that the * client can expect via this particular GW node */ *bw = router_ifinfo->bat_v.throughput; *bw = min_t(u32, *bw, gw_node->bandwidth_down); ret = 0; out: batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); return ret; } /** * batadv_v_gw_get_best_gw_node() - retrieve the best GW node * @bat_priv: the bat priv with all the soft interface information * * Return: the GW node having the best GW-metric, NULL if no GW is known */ static struct batadv_gw_node * batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node, *curr_gw = NULL; u32 max_bw = 0, bw; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) { if (!kref_get_unless_zero(&gw_node->refcount)) continue; if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) goto next; if (curr_gw && bw <= max_bw) goto next; batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); max_bw = bw; next: batadv_gw_node_put(gw_node); } rcu_read_unlock(); return curr_gw; } /** * batadv_v_gw_is_eligible() - check if a originator would be selected as GW * @bat_priv: the bat priv with all the soft interface information * @curr_gw_orig: originator representing the currently selected GW * @orig_node: the originator representing the new candidate * * Return: true if orig_node can be selected as current GW, false otherwise */ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, struct batadv_orig_node *curr_gw_orig, struct batadv_orig_node *orig_node) { struct batadv_gw_node *curr_gw, *orig_gw = NULL; u32 gw_throughput, orig_throughput, threshold; bool ret = false; threshold = atomic_read(&bat_priv->gw.sel_class); curr_gw = batadv_gw_node_get(bat_priv, curr_gw_orig); if (!curr_gw) { ret = true; goto out; } if (batadv_v_gw_throughput_get(curr_gw, &gw_throughput) < 0) { ret = true; goto out; } orig_gw = batadv_gw_node_get(bat_priv, orig_node); if (!orig_gw) goto out; if (batadv_v_gw_throughput_get(orig_gw, &orig_throughput) < 0) goto out; if (orig_throughput < gw_throughput) goto out; if ((orig_throughput - gw_throughput) < threshold) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Restarting gateway selection: better gateway found (throughput curr: %u, throughput new: %u)\n", gw_throughput, orig_throughput); ret = true; out: batadv_gw_node_put(curr_gw); batadv_gw_node_put(orig_gw); return ret; } /** * batadv_v_gw_dump_entry() - Dump a gateway into a message * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success */ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, struct batadv_gw_node *gw_node) { struct batadv_neigh_ifinfo *router_ifinfo = NULL; struct batadv_neigh_node *router; struct batadv_gw_node *curr_gw = NULL; int ret = 0; void *hdr; router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT); if (!router) goto out; router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); ret = -EMSGSIZE; if (curr_gw == gw_node) { if (nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) { genlmsg_cancel(msg, hdr); goto out; } } if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, gw_node->orig_node->orig)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, router_ifinfo->bat_v.throughput)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_ROUTER, ETH_ALEN, router->addr)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_string(msg, BATADV_ATTR_HARD_IFNAME, router->if_incoming->net_dev->name)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, router->if_incoming->net_dev->ifindex)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN, gw_node->bandwidth_down)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP, gw_node->bandwidth_up)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: batadv_gw_node_put(curr_gw); batadv_neigh_ifinfo_put(router_ifinfo); batadv_neigh_node_put(router); return ret; } /** * batadv_v_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * @bat_priv: The bat priv with all the soft interface information */ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) { int portid = NETLINK_CB(cb->skb).portid; struct batadv_gw_node *gw_node; int idx_skip = cb->args[0]; int idx = 0; spin_lock_bh(&bat_priv->gw.list_lock); cb->seq = bat_priv->gw.generation << 1 | 1; hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) { if (idx++ < idx_skip) continue; if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv, gw_node)) { idx_skip = idx - 1; goto unlock; } } idx_skip = idx; unlock: spin_unlock_bh(&bat_priv->gw.list_lock); cb->args[0] = idx_skip; } static struct batadv_algo_ops batadv_batman_v __read_mostly = { .name = "BATMAN_V", .iface = { .activate = batadv_v_iface_activate, .enable = batadv_v_iface_enable, .disable = batadv_v_iface_disable, .update_mac = batadv_v_iface_update_mac, .primary_set = batadv_v_primary_iface_set, }, .neigh = { .hardif_init = batadv_v_hardif_neigh_init, .cmp = batadv_v_neigh_cmp, .is_similar_or_better = batadv_v_neigh_is_sob, .dump = batadv_v_neigh_dump, }, .orig = { .dump = batadv_v_orig_dump, }, .gw = { .init_sel_class = batadv_v_init_sel_class, .sel_class_max = U32_MAX, .get_best_gw_node = batadv_v_gw_get_best_gw_node, .is_eligible = batadv_v_gw_is_eligible, .dump = batadv_v_gw_dump, }, }; /** * batadv_v_hardif_init() - initialize the algorithm specific fields in the * hard-interface object * @hard_iface: the hard-interface to initialize */ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface) { /* enable link throughput auto-detection by setting the throughput * override to zero */ atomic_set(&hard_iface->bat_v.throughput_override, 0); atomic_set(&hard_iface->bat_v.elp_interval, 500); hard_iface->bat_v.aggr_len = 0; skb_queue_head_init(&hard_iface->bat_v.aggr_list); INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq, batadv_v_ogm_aggr_work); } /** * batadv_v_mesh_init() - initialize the B.A.T.M.A.N. V private resources for a * mesh * @bat_priv: the object representing the mesh interface to initialise * * Return: 0 on success or a negative error code otherwise */ int batadv_v_mesh_init(struct batadv_priv *bat_priv) { int ret = 0; ret = batadv_v_ogm_init(bat_priv); if (ret < 0) return ret; return 0; } /** * batadv_v_mesh_free() - free the B.A.T.M.A.N. V private resources for a mesh * @bat_priv: the object representing the mesh interface to free */ void batadv_v_mesh_free(struct batadv_priv *bat_priv) { batadv_v_ogm_free(bat_priv); } /** * batadv_v_init() - B.A.T.M.A.N. V initialization function * * Description: Takes care of initializing all the subcomponents. * It is invoked upon module load only. * * Return: 0 on success or a negative error code otherwise */ int __init batadv_v_init(void) { int ret; /* B.A.T.M.A.N. V echo location protocol packet */ ret = batadv_recv_handler_register(BATADV_ELP, batadv_v_elp_packet_recv); if (ret < 0) return ret; ret = batadv_recv_handler_register(BATADV_OGM2, batadv_v_ogm_packet_recv); if (ret < 0) goto elp_unregister; ret = batadv_algo_register(&batadv_batman_v); if (ret < 0) goto ogm_unregister; return ret; ogm_unregister: batadv_recv_handler_unregister(BATADV_OGM2); elp_unregister: batadv_recv_handler_unregister(BATADV_ELP); return ret; }
3 15 15 15 15 18 18 18 18 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; interface code * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_stp.h" /* Port id is composed of priority and port number. * NB: some bits of priority are dropped to * make room for more ports. */ static inline port_id br_make_port_id(__u8 priority, __u16 port_no) { return ((u16)priority << BR_PORT_BITS) | (port_no & ((1<<BR_PORT_BITS)-1)); } #define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS) /* called under bridge lock */ void br_init_port(struct net_bridge_port *p) { int err; p->port_id = br_make_port_id(p->priority, p->port_no); br_become_designated_port(p); br_set_state(p, BR_STATE_BLOCKING); p->topology_change_ack = 0; p->config_pending = 0; err = __set_ageing_time(p->dev, p->br->ageing_time); if (err) netdev_err(p->dev, "failed to offload ageing time\n"); } /* NO locks held */ void br_stp_enable_bridge(struct net_bridge *br) { struct net_bridge_port *p; spin_lock_bh(&br->lock); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, jiffies + br->hello_time); mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10); br_config_bpdu_generation(br); list_for_each_entry(p, &br->port_list, list) { if (netif_running(p->dev) && netif_oper_up(p->dev)) br_stp_enable_port(p); } spin_unlock_bh(&br->lock); } /* NO locks held */ void br_stp_disable_bridge(struct net_bridge *br) { struct net_bridge_port *p; spin_lock_bh(&br->lock); list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED) br_stp_disable_port(p); } __br_set_topology_change(br, 0); br->topology_change_detected = 0; spin_unlock_bh(&br->lock); del_timer_sync(&br->hello_timer); del_timer_sync(&br->topology_change_timer); del_timer_sync(&br->tcn_timer); cancel_delayed_work_sync(&br->gc_work); } /* called under bridge lock */ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); br_ifinfo_notify(RTM_NEWLINK, NULL, p); } /* called under bridge lock */ void br_stp_disable_port(struct net_bridge_port *p) { struct net_bridge *br = p->br; int wasroot; wasroot = br_is_root_bridge(br); br_become_designated_port(p); br_set_state(p, BR_STATE_DISABLED); p->topology_change_ack = 0; p->config_pending = 0; br_ifinfo_notify(RTM_NEWLINK, NULL, p); del_timer(&p->message_age_timer); del_timer(&p->forward_delay_timer); del_timer(&p->hold_timer); if (!rcu_access_pointer(p->backup_port)) br_fdb_delete_by_port(br, p, 0, 0); br_multicast_disable_port(p); br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); } static int br_stp_call_user(struct net_bridge *br, char *arg) { char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL }; char *envp[] = { NULL }; int rc; /* call userspace STP and report program errors */ rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); if (rc > 0) { if (rc & 0xff) br_debug(br, BR_STP_PROG " received signal %d\n", rc & 0x7f); else br_debug(br, BR_STP_PROG " exited with code %d\n", (rc >> 8) & 0xff); } return rc; } static void br_stp_start(struct net_bridge *br) { int err = -ENOENT; if (net_eq(dev_net(br->dev), &init_net)) err = br_stp_call_user(br, "start"); if (err && err != -ENOENT) br_err(br, "failed to start userspace STP (%d)\n", err); spin_lock_bh(&br->lock); if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY) __br_set_forward_delay(br, BR_MIN_FORWARD_DELAY); else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY) __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY); if (!err) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); } else { br->stp_enabled = BR_KERNEL_STP; br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ if (br->dev->flags & IFF_UP) mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } spin_unlock_bh(&br->lock); } static void br_stp_stop(struct net_bridge *br) { int err; if (br->stp_enabled == BR_USER_STP) { err = br_stp_call_user(br, "stop"); if (err) br_err(br, "failed to stop userspace STP (%d)\n", err); /* To start timers on any ports left in blocking */ spin_lock_bh(&br->lock); br_port_state_selection(br); spin_unlock_bh(&br->lock); } br->stp_enabled = BR_NO_STP; } int br_stp_set_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (br_mrp_enabled(br)) { NL_SET_ERR_MSG_MOD(extack, "STP can't be enabled if MRP is already enabled"); return -EINVAL; } if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); } else { if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } return 0; } /* called under bridge lock */ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { /* should be aligned on 2 bytes for ether_addr_equal() */ unsigned short oldaddr_aligned[ETH_ALEN >> 1]; unsigned char *oldaddr = (unsigned char *)oldaddr_aligned; struct net_bridge_port *p; int wasroot; wasroot = br_is_root_bridge(br); br_fdb_change_mac_address(br, addr); memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); memcpy(br->bridge_id.addr, addr, ETH_ALEN); eth_hw_addr_set(br->dev, addr); list_for_each_entry(p, &br->port_list, list) { if (ether_addr_equal(p->designated_bridge.addr, oldaddr)) memcpy(p->designated_bridge.addr, addr, ETH_ALEN); if (ether_addr_equal(p->designated_root.addr, oldaddr)) memcpy(p->designated_root.addr, addr, ETH_ALEN); } br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); } /* should be aligned on 2 bytes for ether_addr_equal() */ static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1]; /* called under bridge lock */ bool br_stp_recalculate_bridge_id(struct net_bridge *br) { const unsigned char *br_mac_zero = (const unsigned char *)br_mac_zero_aligned; const unsigned char *addr = br_mac_zero; struct net_bridge_port *p; /* user has chosen a value so keep it */ if (br->dev->addr_assign_type == NET_ADDR_SET) return false; list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) addr = p->dev->dev_addr; } if (ether_addr_equal(br->bridge_id.addr, addr)) return false; /* no change */ br_stp_change_bridge_id(br, addr); return true; } /* Acquires and releases bridge lock */ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) { struct net_bridge_port *p; int wasroot; spin_lock_bh(&br->lock); wasroot = br_is_root_bridge(br); list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_is_designated_port(p)) { p->designated_bridge.prio[0] = (newprio >> 8) & 0xFF; p->designated_bridge.prio[1] = newprio & 0xFF; } } br->bridge_id.prio[0] = (newprio >> 8) & 0xFF; br->bridge_id.prio[1] = newprio & 0xFF; br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); spin_unlock_bh(&br->lock); } /* called under bridge lock */ int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio) { port_id new_port_id; if (newprio > BR_MAX_PORT_PRIORITY) return -ERANGE; new_port_id = br_make_port_id(newprio, p->port_no); if (br_is_designated_port(p)) p->designated_port = new_port_id; p->port_id = new_port_id; p->priority = newprio; if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) && p->port_id < p->designated_port) { br_become_designated_port(p); br_port_state_selection(p->br); } return 0; } /* called under bridge lock */ int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost) { if (path_cost < BR_MIN_PATH_COST || path_cost > BR_MAX_PATH_COST) return -ERANGE; p->flags |= BR_ADMIN_COST; p->path_cost = path_cost; br_configuration_update(p->br); br_port_state_selection(p->br); return 0; } ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) { return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", id->prio[0], id->prio[1], id->addr[0], id->addr[1], id->addr[2], id->addr[3], id->addr[4], id->addr[5]); }
9 9 9 9 9 9 1 1 1 5 2 2 1 1 1 1 1 1 2 9 9 9 1 1 1 1 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 // SPDX-License-Identifier: GPL-2.0-or-later /* -*- linux-c -*- * INET 802.1Q VLAN * Ethernet-type device handling. * * Authors: Ben Greear <greearb@candelatech.com> * Please send support related email to: netdev@vger.kernel.org * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html * * Fixes: Mar 22 2001: Martin Bokaemper <mbokaemper@unispherenetworks.com> * - reset skb->pkt_type on incoming packets when MAC was changed * - see that changed MAC is saddr for outgoing packets * Oct 20, 2001: Ard van Breeman: * - Fix MC-list, finally. * - Flush MC-list on VLAN destroy. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/net_tstamp.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/phy.h> #include <net/arp.h> #include <net/macsec.h> #include "vlan.h" #include "vlanproc.h" #include <linux/if_vlan.h> #include <linux/netpoll.h> /* * Create the VLAN header for an arbitrary protocol layer * * saddr=NULL means use device source address * daddr=NULL means leave destination address (eg unresolved arp) * * This is called when the SKB is moving down the stack towards the * physical devices. */ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_hdr *vhdr; unsigned int vhdrlen = 0; u16 vlan_tci = 0; int rc; if (!(vlan->flags & VLAN_FLAG_REORDER_HDR)) { vhdr = skb_push(skb, VLAN_HLEN); vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); vhdr->h_vlan_TCI = htons(vlan_tci); /* * Set the protocol type. For a packet of type ETH_P_802_3/2 we * put the length in here instead. */ if (type != ETH_P_802_3 && type != ETH_P_802_2) vhdr->h_vlan_encapsulated_proto = htons(type); else vhdr->h_vlan_encapsulated_proto = htons(len); skb->protocol = vlan->vlan_proto; type = ntohs(vlan->vlan_proto); vhdrlen = VLAN_HLEN; } /* Before delegating work to the lower layer, enter our MAC-address */ if (saddr == NULL) saddr = dev->dev_addr; /* Now make the underlying real hard header */ dev = vlan->real_dev; rc = dev_hard_header(skb, dev, type, daddr, saddr, len + vhdrlen); if (rc > 0) rc += vhdrlen; return rc; } static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); return NETDEV_TX_OK; #endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); unsigned int len; int ret; /* Handle non-VLAN frames if they are sent to us, for example by DHCP. * * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ if (vlan->flags & VLAN_FLAG_REORDER_HDR || veth->h_vlan_proto != vlan->vlan_proto) { u16 vlan_tci; vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority); __vlan_hwaccel_put_tag(skb, vlan->vlan_proto, vlan_tci); } skb->dev = vlan->real_dev; len = skb->len; if (unlikely(netpoll_tx_running(dev))) return vlan_netpoll_send_skb(vlan, skb); ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *stats; stats = this_cpu_ptr(vlan->vlan_pcpu_stats); u64_stats_update_begin(&stats->syncp); u64_stats_inc(&stats->tx_packets); u64_stats_add(&stats->tx_bytes, len); u64_stats_update_end(&stats->syncp); } else { this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped); } return ret; } static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; unsigned int max_mtu = real_dev->mtu; if (netif_reduces_vlan_mtu(real_dev)) max_mtu -= VLAN_HLEN; if (max_mtu < new_mtu) return -ERANGE; dev->mtu = new_mtu; return 0; } void vlan_dev_set_ingress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); if (vlan->ingress_priority_map[vlan_prio & 0x7] && !skb_prio) vlan->nr_ingress_mappings--; else if (!vlan->ingress_priority_map[vlan_prio & 0x7] && skb_prio) vlan->nr_ingress_mappings++; vlan->ingress_priority_map[vlan_prio & 0x7] = skb_prio; } int vlan_dev_set_egress_priority(const struct net_device *dev, u32 skb_prio, u16 vlan_prio) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *mp = NULL; struct vlan_priority_tci_mapping *np; u32 vlan_qos = (vlan_prio << VLAN_PRIO_SHIFT) & VLAN_PRIO_MASK; /* See if a priority mapping exists.. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; while (mp) { if (mp->priority == skb_prio) { if (mp->vlan_qos && !vlan_qos) vlan->nr_egress_mappings--; else if (!mp->vlan_qos && vlan_qos) vlan->nr_egress_mappings++; mp->vlan_qos = vlan_qos; return 0; } mp = mp->next; } /* Create a new mapping then. */ mp = vlan->egress_priority_map[skb_prio & 0xF]; np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL); if (!np) return -ENOBUFS; np->next = mp; np->priority = skb_prio; np->vlan_qos = vlan_qos; /* Before inserting this element in hash table, make sure all its fields * are committed to memory. * coupled with smp_rmb() in vlan_dev_get_egress_qos_mask() */ smp_wmb(); vlan->egress_priority_map[skb_prio & 0xF] = np; if (vlan_qos) vlan->nr_egress_mappings++; return 0; } /* Flags are defined in the vlan_flags enum in * include/uapi/linux/if_vlan.h file. */ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); u32 old_flags = vlan->flags; if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) return -EINVAL; vlan->flags = (old_flags & ~mask) | (flags & mask); if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_GVRP) { if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); else vlan_gvrp_request_leave(dev); } if (netif_running(dev) && (vlan->flags ^ old_flags) & VLAN_FLAG_MVRP) { if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); else vlan_mvrp_request_leave(dev); } return 0; } void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size) { strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size); } bool vlan_dev_inherit_address(struct net_device *dev, struct net_device *real_dev) { if (dev->addr_assign_type != NET_ADDR_STOLEN) return false; eth_hw_addr_set(dev, real_dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return true; } static int vlan_dev_open(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; int err; if (!(real_dev->flags & IFF_UP) && !(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) return -ENETDOWN; if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) && !vlan_dev_inherit_address(dev, real_dev)) { err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) goto out; } if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(real_dev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(real_dev, 1); if (err < 0) goto clear_allmulti; } ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr); if (vlan->flags & VLAN_FLAG_GVRP) vlan_gvrp_request_join(dev); if (vlan->flags & VLAN_FLAG_MVRP) vlan_mvrp_request_join(dev); if (netif_carrier_ok(real_dev) && !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_on(dev); return 0; clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); del_unicast: if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); out: netif_carrier_off(dev); return err; } static int vlan_dev_stop(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(real_dev, -1); if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING)) netif_carrier_off(dev); return 0; } static int vlan_dev_set_mac_address(struct net_device *dev, void *p) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; struct sockaddr *addr = p; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (!(dev->flags & IFF_UP)) goto out; if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; } if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); out: eth_hw_addr_set(dev, addr->sa_data); return 0; } static int vlan_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return generic_hwtstamp_get_lower(real_dev, cfg); } static int vlan_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (!net_eq(dev_net(dev), dev_net(real_dev))) return -EOPNOTSUPP; return generic_hwtstamp_set_lower(real_dev, cfg, extack); } static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; struct ifreq ifrr; int err = -EOPNOTSUPP; strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ); ifrr.ifr_ifru = ifr->ifr_ifru; switch (cmd) { case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; } if (!err) ifr->ifr_ifru = ifrr.ifr_ifru; return err; } static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int err = 0; if (netif_device_present(real_dev) && ops->ndo_neigh_setup) err = ops->ndo_neigh_setup(real_dev, pa); return err; } #if IS_ENABLED(CONFIG_FCOE) static int vlan_dev_fcoe_ddp_setup(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_setup) rc = ops->ndo_fcoe_ddp_setup(real_dev, xid, sgl, sgc); return rc; } static int vlan_dev_fcoe_ddp_done(struct net_device *dev, u16 xid) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int len = 0; if (ops->ndo_fcoe_ddp_done) len = ops->ndo_fcoe_ddp_done(real_dev, xid); return len; } static int vlan_dev_fcoe_enable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_enable) rc = ops->ndo_fcoe_enable(real_dev); return rc; } static int vlan_dev_fcoe_disable(struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_disable) rc = ops->ndo_fcoe_disable(real_dev); return rc; } static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = 0; if (ops->ndo_fcoe_ddp_target) rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc); return rc; } #endif #ifdef NETDEV_FCOE_WWNN static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; const struct net_device_ops *ops = real_dev->netdev_ops; int rc = -EINVAL; if (ops->ndo_fcoe_get_wwn) rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type); return rc; } #endif static void vlan_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; if (dev->flags & IFF_UP) { if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); } } static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) { dev_mc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } /* * vlan network devices have devices nesting below it, and are a special * "super class" of normal network devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key vlan_netdev_xmit_lock_key; static struct lock_class_key vlan_netdev_addr_lock_key; static void vlan_dev_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, void *unused) { lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); } static void vlan_dev_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &vlan_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); } static __be16 vlan_parse_protocol(const struct sk_buff *skb) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); return __vlan_get_protocol(skb, veth->h_vlan_proto, NULL); } static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; if (saddr == NULL) saddr = dev->dev_addr; return dev_hard_header(skb, real_dev, type, daddr, saddr, len); } static const struct header_ops vlan_passthru_header_ops = { .create = vlan_passthru_hard_header, .parse = eth_header_parse, .parse_protocol = vlan_parse_protocol, }; static struct device_type vlan_type = { .name = "vlan", }; static const struct net_device_ops vlan_netdev_ops; static int vlan_dev_init(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; netif_carrier_off(dev); /* IFF_BROADCAST|IFF_MULTICAST; ??? */ dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_MASTER | IFF_SLAVE); dev->state = (real_dev->state & ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING) dev->state |= (1 << __LINK_STATE_NOCARRIER); dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_ALL_FCOE; if (real_dev->vlan_features & NETIF_F_HW_MACSEC) dev->hw_features |= NETIF_F_HW_MACSEC; dev->features |= dev->hw_features | NETIF_F_LLTX; netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE; dev->hw_enc_features = vlan_tnl_features(real_dev); dev->mpls_features = real_dev->mpls_features; /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; if (is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_set(dev, real_dev->dev_addr); dev->addr_assign_type = NET_ADDR_STOLEN; } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); #if IS_ENABLED(CONFIG_FCOE) dev->fcoe_ddp_xid = real_dev->fcoe_ddp_xid; #endif dev->needed_headroom = real_dev->needed_headroom; if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) { dev->header_ops = &vlan_passthru_header_ops; dev->hard_header_len = real_dev->hard_header_len; } else { dev->header_ops = &vlan_header_ops; dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; } dev->netdev_ops = &vlan_netdev_ops; SET_NETDEV_DEVTYPE(dev, &vlan_type); vlan_dev_set_lockdep_class(dev); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; /* Get vlan's reference to real_dev */ netdev_hold(real_dev, &vlan->dev_tracker, GFP_KERNEL); return 0; } /* Note: this function might be called multiple times for the same device. */ void vlan_dev_free_egress_priority(const struct net_device *dev) { struct vlan_priority_tci_mapping *pm; struct vlan_dev_priv *vlan = vlan_dev_priv(dev); int i; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { while ((pm = vlan->egress_priority_map[i]) != NULL) { vlan->egress_priority_map[i] = pm->next; kfree(pm); } } } static void vlan_dev_uninit(struct net_device *dev) { vlan_dev_free_egress_priority(dev); } static netdev_features_t vlan_dev_fix_features(struct net_device *dev, netdev_features_t features) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; netdev_features_t lower_features; lower_features = netdev_intersect_features((real_dev->vlan_features | NETIF_F_RXCSUM), real_dev->features); /* Add HW_CSUM setting to preserve user ability to control * checksum offload on the vlan device. */ if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) lower_features |= NETIF_F_HW_CSUM; features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); features |= NETIF_F_LLTX; return features; } static int vlan_ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return __ethtool_get_link_ksettings(vlan->real_dev, cmd); } static void vlan_ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, vlan_fullname, sizeof(info->driver)); strscpy(info->version, vlan_version, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); } static int vlan_ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) { const struct vlan_dev_priv *vlan = vlan_dev_priv(dev); const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops; struct phy_device *phydev = vlan->real_dev->phydev; if (phy_has_tsinfo(phydev)) { return phy_ts_info(phydev, info); } else if (ops->get_ts_info) { return ops->get_ts_info(vlan->real_dev, info); } else { info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; info->phc_index = -1; } return 0; } static void vlan_dev_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct vlan_pcpu_stats *p; u32 rx_errors = 0, tx_dropped = 0; int i; for_each_possible_cpu(i) { u64 rxpackets, rxbytes, rxmulticast, txpackets, txbytes; unsigned int start; p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { start = u64_stats_fetch_begin(&p->syncp); rxpackets = u64_stats_read(&p->rx_packets); rxbytes = u64_stats_read(&p->rx_bytes); rxmulticast = u64_stats_read(&p->rx_multicast); txpackets = u64_stats_read(&p->tx_packets); txbytes = u64_stats_read(&p->tx_bytes); } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; stats->multicast += rxmulticast; stats->tx_packets += txpackets; stats->tx_bytes += txbytes; /* rx_errors & tx_dropped are u32 */ rx_errors += READ_ONCE(p->rx_errors); tx_dropped += READ_ONCE(p->tx_dropped); } stats->rx_errors = rx_errors; stats->tx_dropped = tx_dropped; } #ifdef CONFIG_NET_POLL_CONTROLLER static void vlan_dev_poll_controller(struct net_device *dev) { return; } static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); err = -ENOMEM; if (!netpoll) goto out; err = __netpoll_setup(netpoll, real_dev); if (err) { kfree(netpoll); goto out; } vlan->netpoll = netpoll; out: return err; } static void vlan_dev_netpoll_cleanup(struct net_device *dev) { struct vlan_dev_priv *vlan= vlan_dev_priv(dev); struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; vlan->netpoll = NULL; __netpoll_free(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ static int vlan_dev_get_iflink(const struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return real_dev->ifindex; } static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev); path->type = DEV_PATH_VLAN; path->encap.id = vlan->vlan_id; path->encap.proto = vlan->vlan_proto; path->dev = ctx->dev; ctx->dev = vlan->real_dev; if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) return -ENOSPC; ctx->vlan[ctx->num_vlans].id = vlan->vlan_id; ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto; ctx->num_vlans++; return 0; } #if IS_ENABLED(CONFIG_MACSEC) static const struct macsec_ops *vlan_get_macsec_ops(const struct macsec_context *ctx) { return vlan_dev_priv(ctx->netdev)->real_dev->macsec_ops; } static int vlan_macsec_offload(int (* const func)(struct macsec_context *), struct macsec_context *ctx) { if (unlikely(!func)) return 0; return (*func)(ctx); } static int vlan_macsec_dev_open(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_open, ctx); } static int vlan_macsec_dev_stop(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_dev_stop, ctx); } static int vlan_macsec_add_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_secy, ctx); } static int vlan_macsec_upd_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_secy, ctx); } static int vlan_macsec_del_secy(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_secy, ctx); } static int vlan_macsec_add_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsc, ctx); } static int vlan_macsec_upd_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsc, ctx); } static int vlan_macsec_del_rxsc(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsc, ctx); } static int vlan_macsec_add_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_rxsa, ctx); } static int vlan_macsec_upd_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_rxsa, ctx); } static int vlan_macsec_del_rxsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_rxsa, ctx); } static int vlan_macsec_add_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_add_txsa, ctx); } static int vlan_macsec_upd_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_upd_txsa, ctx); } static int vlan_macsec_del_txsa(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_del_txsa, ctx); } static int vlan_macsec_get_dev_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_dev_stats, ctx); } static int vlan_macsec_get_tx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sc_stats, ctx); } static int vlan_macsec_get_tx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_tx_sa_stats, ctx); } static int vlan_macsec_get_rx_sc_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sc_stats, ctx); } static int vlan_macsec_get_rx_sa_stats(struct macsec_context *ctx) { const struct macsec_ops *ops = vlan_get_macsec_ops(ctx); if (!ops) return -EOPNOTSUPP; return vlan_macsec_offload(ops->mdo_get_rx_sa_stats, ctx); } static const struct macsec_ops macsec_offload_ops = { /* Device wide */ .mdo_dev_open = vlan_macsec_dev_open, .mdo_dev_stop = vlan_macsec_dev_stop, /* SecY */ .mdo_add_secy = vlan_macsec_add_secy, .mdo_upd_secy = vlan_macsec_upd_secy, .mdo_del_secy = vlan_macsec_del_secy, /* Security channels */ .mdo_add_rxsc = vlan_macsec_add_rxsc, .mdo_upd_rxsc = vlan_macsec_upd_rxsc, .mdo_del_rxsc = vlan_macsec_del_rxsc, /* Security associations */ .mdo_add_rxsa = vlan_macsec_add_rxsa, .mdo_upd_rxsa = vlan_macsec_upd_rxsa, .mdo_del_rxsa = vlan_macsec_del_rxsa, .mdo_add_txsa = vlan_macsec_add_txsa, .mdo_upd_txsa = vlan_macsec_upd_txsa, .mdo_del_txsa = vlan_macsec_del_txsa, /* Statistics */ .mdo_get_dev_stats = vlan_macsec_get_dev_stats, .mdo_get_tx_sc_stats = vlan_macsec_get_tx_sc_stats, .mdo_get_tx_sa_stats = vlan_macsec_get_tx_sa_stats, .mdo_get_rx_sc_stats = vlan_macsec_get_rx_sc_stats, .mdo_get_rx_sa_stats = vlan_macsec_get_rx_sa_stats, }; #endif static const struct ethtool_ops vlan_ethtool_ops = { .get_link_ksettings = vlan_ethtool_get_link_ksettings, .get_drvinfo = vlan_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_ts_info = vlan_ethtool_get_ts_info, }; static const struct net_device_ops vlan_netdev_ops = { .ndo_change_mtu = vlan_dev_change_mtu, .ndo_init = vlan_dev_init, .ndo_uninit = vlan_dev_uninit, .ndo_open = vlan_dev_open, .ndo_stop = vlan_dev_stop, .ndo_start_xmit = vlan_dev_hard_start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = vlan_dev_set_mac_address, .ndo_set_rx_mode = vlan_dev_set_rx_mode, .ndo_change_rx_flags = vlan_dev_change_rx_flags, .ndo_eth_ioctl = vlan_dev_ioctl, .ndo_neigh_setup = vlan_dev_neigh_setup, .ndo_get_stats64 = vlan_dev_get_stats64, #if IS_ENABLED(CONFIG_FCOE) .ndo_fcoe_ddp_setup = vlan_dev_fcoe_ddp_setup, .ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done, .ndo_fcoe_enable = vlan_dev_fcoe_enable, .ndo_fcoe_disable = vlan_dev_fcoe_disable, .ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target, #endif #ifdef NETDEV_FCOE_WWNN .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn, #endif #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = vlan_dev_poll_controller, .ndo_netpoll_setup = vlan_dev_netpoll_setup, .ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup, #endif .ndo_fix_features = vlan_dev_fix_features, .ndo_get_iflink = vlan_dev_get_iflink, .ndo_fill_forward_path = vlan_dev_fill_forward_path, .ndo_hwtstamp_get = vlan_hwtstamp_get, .ndo_hwtstamp_set = vlan_hwtstamp_set, }; static void vlan_dev_free(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); free_percpu(vlan->vlan_pcpu_stats); vlan->vlan_pcpu_stats = NULL; /* Get rid of the vlan's reference to real_dev */ netdev_put(vlan->real_dev, &vlan->dev_tracker); } void vlan_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); dev->netdev_ops = &vlan_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = vlan_dev_free; dev->ethtool_ops = &vlan_ethtool_ops; #if IS_ENABLED(CONFIG_MACSEC) dev->macsec_ops = &macsec_offload_ops; #endif dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; eth_zero_addr(dev->broadcast); }
3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 // SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Monitoring SMC transport protocol sockets * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sock_diag.h> #include <linux/inet_diag.h> #include <linux/smc_diag.h> #include <net/netlink.h> #include <net/smc.h> #include "smc.h" #include "smc_core.h" struct smc_diag_dump_ctx { int pos[2]; }; static struct smc_diag_dump_ctx *smc_dump_context(struct netlink_callback *cb) { return (struct smc_diag_dump_ctx *)cb->ctx; } static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); memset(r, 0, sizeof(*r)); r->diag_family = sk->sk_family; sock_diag_save_cookie(sk, r->id.idiag_cookie); if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; if (sk->sk_protocol == SMCPROTO_SMC) { r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; #if IS_ENABLED(CONFIG_IPV6) } else if (sk->sk_protocol == SMCPROTO_SMC6) { memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr, sizeof(smc->clcsock->sk->sk_v6_rcv_saddr)); memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr, sizeof(smc->clcsock->sk->sk_v6_daddr)); #endif } } static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct smc_diag_msg *r, struct user_namespace *user_ns) { if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) return 1; r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->diag_inode = sock_i_ino(sk); return 0; } static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct smc_diag_req *req, struct nlattr *bc) { struct smc_sock *smc = smc_sk(sk); struct smc_diag_fallback fallback; struct user_namespace *user_ns; struct smc_diag_msg *r; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); smc_diag_msg_common_fill(r, sk); r->diag_state = sk->sk_state; if (smc->use_fallback) r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP; else if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd) r->diag_mode = SMC_DIAG_MODE_SMCD; else r->diag_mode = SMC_DIAG_MODE_SMCR; user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk); if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns)) goto errout; fallback.reason = smc->fallback_rsn; fallback.peer_diagnosis = smc->peer_diagnosis; if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0) goto errout; if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.alert_token_local) { struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, .sndbuf_size = conn->sndbuf_desc ? conn->sndbuf_desc->len : 0, .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0, .peer_rmbe_size = conn->peer_rmbe_size, .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, .rx_prod.count = conn->local_rx_ctrl.prod.count, .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap, .rx_cons.count = conn->local_rx_ctrl.cons.count, .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap, .tx_prod.count = conn->local_tx_ctrl.prod.count, .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap, .tx_cons.count = conn->local_tx_ctrl.cons.count, .tx_prod_flags = *(u8 *)&conn->local_tx_ctrl.prod_flags, .tx_conn_state_flags = *(u8 *)&conn->local_tx_ctrl.conn_state_flags, .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags, .rx_conn_state_flags = *(u8 *)&conn->local_rx_ctrl.conn_state_flags, .tx_prep.wrap = conn->tx_curs_prep.wrap, .tx_prep.count = conn->tx_curs_prep.count, .tx_sent.wrap = conn->tx_curs_sent.wrap, .tx_sent.count = conn->tx_curs_sent.count, .tx_fin.wrap = conn->tx_curs_fin.wrap, .tx_fin.count = conn->tx_curs_fin.count, }; if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && !smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_link *link = smc->conn.lnk; struct smc_diag_lgrinfo linfo = { .role = smc->conn.lgr->role, .lnk[0].ibport = link->ibport, .lnk[0].link_id = link->link_id, }; memcpy(linfo.lnk[0].ibname, smc->conn.lgr->lnk[0].smcibdev->ibdev->name, sizeof(link->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0) goto errout; } if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && !list_empty(&smc->conn.lgr->list)) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; struct smcd_dev *smcd = conn->lgr->smcd; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid; dinfo.my_gid = smcd->ops->get_local_gid(smcd); dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; if (nla_put(skb, SMC_DIAG_DMBINFO, sizeof(dinfo), &dinfo) < 0) goto errout; } nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, struct netlink_callback *cb, int p_type) { struct smc_diag_dump_ctx *cb_ctx = smc_dump_context(cb); struct net *net = sock_net(skb->sk); int snum = cb_ctx->pos[p_type]; struct nlattr *bc = NULL; struct hlist_head *head; int rc = 0, num = 0; struct sock *sk; read_lock(&prot->h.smc_hash->lock); head = &prot->h.smc_hash->ht; if (hlist_empty(head)) goto out; sk_for_each(sk, head) { if (!net_eq(sock_net(sk), net)) continue; if (num < snum) goto next; rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc); if (rc < 0) goto out; next: num++; } out: read_unlock(&prot->h.smc_hash->lock); cb_ctx->pos[p_type] = num; return rc; } static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int rc = 0; rc = smc_diag_dump_proto(&smc_proto, skb, cb, SMCPROTO_SMC); if (!rc) smc_diag_dump_proto(&smc_proto6, skb, cb, SMCPROTO_SMC6); return skb->len; } static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { struct net *net = sock_net(skb->sk); if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { { struct netlink_dump_control c = { .dump = smc_diag_dump, .min_dump_alloc = SKB_WITH_OVERHEAD(32768), }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } } return 0; } static const struct sock_diag_handler smc_diag_handler = { .family = AF_SMC, .dump = smc_diag_handler_dump, }; static int __init smc_diag_init(void) { return sock_diag_register(&smc_diag_handler); } static void __exit smc_diag_exit(void) { sock_diag_unregister(&smc_diag_handler); } module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME);
3987 1640 1624 1624 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIME64_H #define _LINUX_TIME64_H #include <linux/math64.h> #include <vdso/time64.h> typedef __s64 time64_t; typedef __u64 timeu64_t; #include <uapi/linux/time.h> struct timespec64 { time64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; struct itimerspec64 { struct timespec64 it_interval; struct timespec64 it_value; }; /* Parameters used to convert the timespec values: */ #define PSEC_PER_NSEC 1000L /* Located here for timespec[64]_valid_strict */ #define TIME64_MAX ((s64)~((u64)1 << 63)) #define TIME64_MIN (-TIME64_MAX - 1) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_MIN (-KTIME_MAX - 1) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) #define KTIME_SEC_MIN (KTIME_MIN / NSEC_PER_SEC) /* * Limits for settimeofday(): * * To prevent setting the time close to the wraparound point time setting * is limited so a reasonable uptime can be accomodated. Uptime of 30 years * should be really sufficient, which means the cutoff is 2232. At that * point the cutoff is just a small part of the larger problem. */ #define TIME_UPTIME_SEC_MAX (30LL * 365 * 24 *3600) #define TIME_SETTOD_SEC_MAX (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX) static inline int timespec64_equal(const struct timespec64 *a, const struct timespec64 *b) { return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); } /* * lhs < rhs: return <0 * lhs == rhs: return 0 * lhs > rhs: return >0 */ static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs) { if (lhs->tv_sec < rhs->tv_sec) return -1; if (lhs->tv_sec > rhs->tv_sec) return 1; return lhs->tv_nsec - rhs->tv_nsec; } extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec); static inline struct timespec64 timespec64_add(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec); return ts_delta; } /* * sub = lhs - rhs, in normalized form */ static inline struct timespec64 timespec64_sub(struct timespec64 lhs, struct timespec64 rhs) { struct timespec64 ts_delta; set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec); return ts_delta; } /* * Returns true if the timespec64 is norm, false if denorm: */ static inline bool timespec64_valid(const struct timespec64 *ts) { /* Dates before 1970 are bogus */ if (ts->tv_sec < 0) return false; /* Can't have more nanoseconds then a second */ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return false; return true; } static inline bool timespec64_valid_strict(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values that could overflow ktime_t */ if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) return false; return true; } static inline bool timespec64_valid_settod(const struct timespec64 *ts) { if (!timespec64_valid(ts)) return false; /* Disallow values which cause overflow issues vs. CLOCK_REALTIME */ if ((unsigned long long)ts->tv_sec >= TIME_SETTOD_SEC_MAX) return false; return true; } /** * timespec64_to_ns - Convert timespec64 to nanoseconds * @ts: pointer to the timespec64 variable to be converted * * Returns the scalar nanosecond representation of the timespec64 * parameter. */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { /* Prevent multiplication overflow / underflow */ if (ts->tv_sec >= KTIME_SEC_MAX) return KTIME_MAX; if (ts->tv_sec <= KTIME_SEC_MIN) return KTIME_MIN; return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } /** * ns_to_timespec64 - Convert nanoseconds to timespec64 * @nsec: the nanoseconds value to be converted * * Returns the timespec64 representation of the nsec parameter. */ extern struct timespec64 ns_to_timespec64(s64 nsec); /** * timespec64_add_ns - Adds nanoseconds to a timespec64 * @a: pointer to timespec64 to be incremented * @ns: unsigned nanoseconds value to be added * * This must always be inlined because its used from the x86-64 vdso, * which cannot call other kernel functions. */ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns) { a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); a->tv_nsec = ns; } /* * timespec64_add_safe assumes both values are positive and checks for * overflow. It will return TIME64_MAX in case of overflow. */ extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs, const struct timespec64 rhs); #endif /* _LINUX_TIME64_H */
1429 1429 1429 1429 1429 1590 1589 1589 267 1426 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/fsync.c * * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) * from * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * from * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds * * ext4fs fsync primitive * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * Removed unnecessary code duplication for little endian machines * and excessive __inline__s. * Andi Kleen, 1997 * * Major simplications and cleanup - we only need to do the metadata, because * we can depend on generic_block_fdatasync() to sync the data blocks. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include <trace/events/ext4.h> /* * If we're not journaling and this is a just-created file, we have to * sync our parent directory (if it was freshly created) since * otherwise it will only be written by writeback, leaving a huge * window during which a crash may lose the file. This may apply for * the parent directory's parent as well, and so on recursively, if * they are also freshly created. */ static int ext4_sync_parent(struct inode *inode) { struct dentry *dentry, *next; int ret = 0; if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) return 0; dentry = d_find_any_alias(inode); if (!dentry) return 0; while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); next = dget_parent(dentry); dput(dentry); dentry = next; inode = dentry->d_inode; /* * The directory inode may have gone through rmdir by now. But * the inode itself and its blocks are still allocated (we hold * a reference to the inode via its dentry), so it didn't go * through ext4_evict_inode()) and so we are safe to flush * metadata blocks and the inode. */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; ret = sync_inode_metadata(inode, 1); if (ret) break; } dput(dentry); return ret; } static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, int datasync, bool *needs_barrier) { struct inode *inode = file->f_inode; int ret; ret = generic_buffers_fsync_noflush(file, start, end, datasync); if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) *needs_barrier = true; return ret; } static int ext4_fsync_journal(struct inode *inode, bool datasync, bool *needs_barrier) { struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; /* * Fastcommit does not really support fsync on directories or other * special files. Force a full commit. */ if (!S_ISREG(inode->i_mode)) return ext4_force_commit(inode->i_sb); if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) *needs_barrier = true; return ext4_fc_commit(journal, commit_tid); } /* * akpm: A new design for ext4_sync_file(). * * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). * There cannot be a transaction open by this task. * Another task could have dirtied this inode. Its data can be in any * state in the journalling system. * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. */ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) { /* Make sure that we read updated s_ext4_flags value */ smp_rmb(); if (ext4_forced_shutdown(inode->i_sb)) ret = -EROFS; goto out; } if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) goto issue_flush; goto out; } ret = file_write_and_wait_range(file, start, end); if (ret) goto out; /* * The caller's filemap_fdatawrite()/wait will sync the data. * Metadata is in the journal, we wait for proper transaction to * commit here. */ ret = ext4_fsync_journal(inode, datasync, &needs_barrier); issue_flush: if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev); if (!ret) ret = err; } out: err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; trace_ext4_sync_file_exit(inode, ret); return ret; }
2 96 96 96 96 96 96 96 96 96 96 96 96 96 96 96 96 96 96 96 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 96 96 96 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2022 Intel Corporation */ #include <linux/kernel.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/slab.h> #include "rate.h" #include "ieee80211_i.h" #include "debugfs.h" struct rate_control_alg { struct list_head list; const struct rate_control_ops *ops; }; static LIST_HEAD(rate_ctrl_algs); static DEFINE_MUTEX(rate_ctrl_mutex); static char *ieee80211_default_rc_algo = CONFIG_MAC80211_RC_DEFAULT; module_param(ieee80211_default_rc_algo, charp, 0644); MODULE_PARM_DESC(ieee80211_default_rc_algo, "Default rate control algorithm for mac80211 to use"); void rate_control_rate_init(struct sta_info *sta) { struct ieee80211_local *local = sta->sdata->local; struct rate_control_ref *ref = sta->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_supported_band *sband; struct ieee80211_chanctx_conf *chanctx_conf; ieee80211_sta_set_rx_nss(&sta->deflink); if (!ref) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sta->sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); return; } sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; /* TODO: check for minstrel_s1g ? */ if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_s1g_sta_rate_init(sta); rcu_read_unlock(); return; } spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, priv_sta); spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_tx_status *st) { struct rate_control_ref *ref = local->rate_ctrl; struct sta_info *sta = container_of(st->sta, struct sta_info, sta); void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_supported_band *sband; if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; sband = local->hw.wiphy->bands[st->info->band]; spin_lock_bh(&sta->rate_ctrl_lock); if (ref->ops->tx_status_ext) ref->ops->tx_status_ext(ref->priv, sband, priv_sta, st); else if (st->skb) ref->ops->tx_status(ref->priv, sband, st->sta, priv_sta, st->skb); else WARN_ON_ONCE(1); spin_unlock_bh(&sta->rate_ctrl_lock); } void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct sta_info *sta, unsigned int link_id, u32 changed) { struct rate_control_ref *ref = local->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_chanctx_conf *chanctx_conf; WARN_ON(link_id != 0); if (ref && ref->ops->rate_update) { rcu_read_lock(); chanctx_conf = rcu_dereference(sta->sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); return; } spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, ista, priv_sta, changed); spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); } drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); } int ieee80211_rate_control_register(const struct rate_control_ops *ops) { struct rate_control_alg *alg; if (!ops->name) return -EINVAL; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (!strcmp(alg->ops->name, ops->name)) { /* don't register an algorithm twice */ WARN_ON(1); mutex_unlock(&rate_ctrl_mutex); return -EALREADY; } } alg = kzalloc(sizeof(*alg), GFP_KERNEL); if (alg == NULL) { mutex_unlock(&rate_ctrl_mutex); return -ENOMEM; } alg->ops = ops; list_add_tail(&alg->list, &rate_ctrl_algs); mutex_unlock(&rate_ctrl_mutex); return 0; } EXPORT_SYMBOL(ieee80211_rate_control_register); void ieee80211_rate_control_unregister(const struct rate_control_ops *ops) { struct rate_control_alg *alg; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (alg->ops == ops) { list_del(&alg->list); kfree(alg); break; } } mutex_unlock(&rate_ctrl_mutex); } EXPORT_SYMBOL(ieee80211_rate_control_unregister); static const struct rate_control_ops * ieee80211_try_rate_control_ops_get(const char *name) { struct rate_control_alg *alg; const struct rate_control_ops *ops = NULL; if (!name) return NULL; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (!strcmp(alg->ops->name, name)) { ops = alg->ops; break; } } mutex_unlock(&rate_ctrl_mutex); return ops; } /* Get the rate control algorithm. */ static const struct rate_control_ops * ieee80211_rate_control_ops_get(const char *name) { const struct rate_control_ops *ops; const char *alg_name; kernel_param_lock(THIS_MODULE); if (!name) alg_name = ieee80211_default_rc_algo; else alg_name = name; ops = ieee80211_try_rate_control_ops_get(alg_name); if (!ops && name) /* try default if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo); /* Note: check for > 0 is intentional to avoid clang warning */ if (!ops && (strlen(CONFIG_MAC80211_RC_DEFAULT) > 0)) /* try built-in one if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); kernel_param_unlock(THIS_MODULE); return ops; } #ifdef CONFIG_MAC80211_DEBUGFS static ssize_t rcname_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct rate_control_ref *ref = file->private_data; int len = strlen(ref->ops->name); return simple_read_from_buffer(userbuf, count, ppos, ref->ops->name, len); } const struct file_operations rcname_ops = { .read = rcname_read, .open = simple_open, .llseek = default_llseek, }; #endif static struct rate_control_ref * rate_control_alloc(const char *name, struct ieee80211_local *local) { struct rate_control_ref *ref; ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); if (!ref) return NULL; ref->ops = ieee80211_rate_control_ops_get(name); if (!ref->ops) goto free; ref->priv = ref->ops->alloc(&local->hw); if (!ref->priv) goto free; return ref; free: kfree(ref); return NULL; } static void rate_control_free(struct ieee80211_local *local, struct rate_control_ref *ctrl_ref) { ctrl_ref->ops->free(ctrl_ref->priv); #ifdef CONFIG_MAC80211_DEBUGFS debugfs_remove_recursive(local->debugfs.rcdir); local->debugfs.rcdir = NULL; #endif kfree(ctrl_ref); } void ieee80211_check_rate_mask(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; u32 user_mask, basic_rates = link->conf->basic_rates; enum nl80211_band band; if (WARN_ON(!link->conf->chandef.chan)) return; band = link->conf->chandef.chan->band; if (band == NL80211_BAND_S1GHZ) { /* TODO */ return; } if (WARN_ON_ONCE(!basic_rates)) return; user_mask = sdata->rc_rateidx_mask[band]; sband = local->hw.wiphy->bands[band]; if (user_mask & basic_rates) return; sdata_dbg(sdata, "no overlap between basic rates (0x%x) and user mask (0x%x on band %d) - clearing the latter", basic_rates, user_mask, band); sdata->rc_rateidx_mask[band] = (1 << sband->n_bitrates) - 1; } static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc) { struct sk_buff *skb = txrc->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); return (info->flags & (IEEE80211_TX_CTL_NO_ACK | IEEE80211_TX_CTL_USE_MINRATE)) || !ieee80211_is_tx_data(skb); } static void rc_send_low_basicrate(struct ieee80211_tx_rate *rate, u32 basic_rates, struct ieee80211_supported_band *sband) { u8 i; if (sband->band == NL80211_BAND_S1GHZ) { /* TODO */ rate->flags |= IEEE80211_TX_RC_S1G_MCS; rate->idx = 0; return; } if (basic_rates == 0) return; /* assume basic rates unknown and accept rate */ if (rate->idx < 0) return; if (basic_rates & (1 << rate->idx)) return; /* selected rate is a basic rate */ for (i = rate->idx + 1; i <= sband->n_bitrates; i++) { if (basic_rates & (1 << i)) { rate->idx = i; return; } } /* could not find a basic rate; use original selection */ } static void __rate_control_send_low(struct ieee80211_hw *hw, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, struct ieee80211_tx_info *info, u32 rate_mask) { int i; u32 rate_flags = ieee80211_chandef_rate_flags(&hw->conf.chandef); if (sband->band == NL80211_BAND_S1GHZ) { info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS; info->control.rates[0].idx = 0; return; } if ((sband->band == NL80211_BAND_2GHZ) && (info->flags & IEEE80211_TX_CTL_NO_CCK_RATE)) rate_flags |= IEEE80211_RATE_ERP_G; info->control.rates[0].idx = 0; for (i = 0; i < sband->n_bitrates; i++) { if (!(rate_mask & BIT(i))) continue; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (!rate_supported(sta, sband->band, i)) continue; info->control.rates[0].idx = i; break; } WARN_ONCE(i == sband->n_bitrates, "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n", sta ? sta->addr : NULL, sta ? sta->deflink.supp_rates[sband->band] : -1, sband->band, rate_mask, rate_flags); info->control.rates[0].count = (info->flags & IEEE80211_TX_CTL_NO_ACK) ? 1 : hw->max_rate_tries; info->control.skip_table = 1; } static bool rate_control_send_low(struct ieee80211_sta *pubsta, struct ieee80211_tx_rate_control *txrc) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_supported_band *sband = txrc->sband; struct sta_info *sta; int mcast_rate; bool use_basicrate = false; if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); if (!pubsta && txrc->bss) { mcast_rate = txrc->bss_conf->mcast_rate[sband->band]; if (mcast_rate > 0) { info->control.rates[0].idx = mcast_rate - 1; return true; } use_basicrate = true; } else if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) use_basicrate = true; } if (use_basicrate) rc_send_low_basicrate(&info->control.rates[0], txrc->bss_conf->basic_rates, sband); return true; } return false; } static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { int j; /* See whether the selected rate or anything below it is allowed. */ for (j = *rate_idx; j >= 0; j--) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ *rate_idx = j; return true; } } /* Try to find a higher rate that would be allowed */ for (j = *rate_idx + 1; j < n_bitrates; j++) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ *rate_idx = j; return true; } } return false; } static bool rate_idx_match_mcs_mask(s8 *rate_idx, u8 *mcs_mask) { int i, j; int ridx, rbit; ridx = *rate_idx / 8; rbit = *rate_idx % 8; /* sanity check */ if (ridx < 0 || ridx >= IEEE80211_HT_MCS_MASK_LEN) return false; /* See whether the selected rate or anything below it is allowed. */ for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) if (mcs_mask[i] & BIT(j)) { *rate_idx = i * 8 + j; return true; } rbit = 7; } /* Try to find a higher rate that would be allowed */ ridx = (*rate_idx + 1) / 8; rbit = (*rate_idx + 1) % 8; for (i = ridx; i < IEEE80211_HT_MCS_MASK_LEN; i++) { for (j = rbit; j < 8; j++) if (mcs_mask[i] & BIT(j)) { *rate_idx = i * 8 + j; return true; } rbit = 0; } return false; } static bool rate_idx_match_vht_mcs_mask(s8 *rate_idx, u16 *vht_mask) { int i, j; int ridx, rbit; ridx = *rate_idx >> 4; rbit = *rate_idx & 0xf; if (ridx < 0 || ridx >= NL80211_VHT_NSS_MAX) return false; /* See whether the selected rate or anything below it is allowed. */ for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) { if (vht_mask[i] & BIT(j)) { *rate_idx = (i << 4) | j; return true; } } rbit = 15; } /* Try to find a higher rate that would be allowed */ ridx = (*rate_idx + 1) >> 4; rbit = (*rate_idx + 1) & 0xf; for (i = ridx; i < NL80211_VHT_NSS_MAX; i++) { for (j = rbit; j < 16; j++) { if (vht_mask[i] & BIT(j)) { *rate_idx = (i << 4) | j; return true; } } rbit = 0; } return false; } static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags, struct ieee80211_supported_band *sband, enum nl80211_chan_width chan_width, u32 mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { if (*rate_flags & IEEE80211_TX_RC_VHT_MCS) { /* handle VHT rates */ if (rate_idx_match_vht_mcs_mask(rate_idx, vht_mask)) return; *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. */ *rate_flags &= ~(IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_40_MHZ_WIDTH); if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; } else if (*rate_flags & IEEE80211_TX_RC_MCS) { /* handle HT rates */ if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. */ *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; } else { /* handle legacy rates */ if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; /* if HT BSS, and we handle a data frame, also try HT rates */ switch (chan_width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: return; default: break; } *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; } /* * Uh.. No suitable rate exists. This should not really happen with * sane TX rate mask configurations. However, should someone manage to * configure supported rates and TX rate mask in incompatible way, * allow the frame to be transmitted with whatever the rate control * selected. */ } static void rate_fixup_ratelist(struct ieee80211_vif *vif, struct ieee80211_supported_band *sband, struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { struct ieee80211_rate *rate; bool inval = false; int i; /* * Set up the RTS/CTS rate as the fastest basic rate * that is not faster than the data rate unless there * is no basic rate slower than the data rate, in which * case we pick the slowest basic rate * * XXX: Should this check all retry rates? */ if (!(rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) { u32 basic_rates = vif->bss_conf.basic_rates; s8 baserate = basic_rates ? ffs(basic_rates) - 1 : 0; rate = &sband->bitrates[rates[0].idx]; for (i = 0; i < sband->n_bitrates; i++) { /* must be a basic rate */ if (!(basic_rates & BIT(i))) continue; /* must not be faster than the data rate */ if (sband->bitrates[i].bitrate > rate->bitrate) continue; /* maximum */ if (sband->bitrates[baserate].bitrate < sband->bitrates[i].bitrate) baserate = i; } info->control.rts_cts_rate_idx = baserate; } for (i = 0; i < max_rates; i++) { /* * make sure there's no valid rate following * an invalid one, just in case drivers don't * take the API seriously to stop at -1. */ if (inval) { rates[i].idx = -1; continue; } if (rates[i].idx < 0) { inval = true; continue; } /* * For now assume MCS is already set up correctly, this * needs to be fixed. */ if (rates[i].flags & IEEE80211_TX_RC_MCS) { WARN_ON(rates[i].idx > 76); if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && info->control.use_cts_prot) rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT; continue; } if (rates[i].flags & IEEE80211_TX_RC_VHT_MCS) { WARN_ON(ieee80211_rate_get_vht_mcs(&rates[i]) > 9); continue; } /* set up RTS protection if desired */ if (info->control.use_rts) { rates[i].flags |= IEEE80211_TX_RC_USE_RTS_CTS; info->control.use_cts_prot = false; } /* RC is busted */ if (WARN_ON_ONCE(rates[i].idx >= sband->n_bitrates)) { rates[i].idx = -1; continue; } rate = &sband->bitrates[rates[i].idx]; /* set up short preamble */ if (info->control.short_preamble && rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) rates[i].flags |= IEEE80211_TX_RC_USE_SHORT_PREAMBLE; /* set up G protection */ if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && info->control.use_cts_prot && rate->flags & IEEE80211_RATE_ERP_G) rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT; } } static void rate_control_fill_sta_table(struct ieee80211_sta *sta, struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { struct ieee80211_sta_rates *ratetbl = NULL; int i; if (sta && !info->control.skip_table) ratetbl = rcu_dereference(sta->rates); /* Fill remaining rate slots with data from the sta rate table. */ max_rates = min_t(int, max_rates, IEEE80211_TX_RATE_TABLE_SIZE); for (i = 0; i < max_rates; i++) { if (i < ARRAY_SIZE(info->control.rates) && info->control.rates[i].idx >= 0 && info->control.rates[i].count) { if (rates != info->control.rates) rates[i] = info->control.rates[i]; } else if (ratetbl) { rates[i].idx = ratetbl->rate[i].idx; rates[i].flags = ratetbl->rate[i].flags; if (info->control.use_rts) rates[i].count = ratetbl->rate[i].count_rts; else if (info->control.use_cts_prot) rates[i].count = ratetbl->rate[i].count_cts; else rates[i].count = ratetbl->rate[i].count; } else { rates[i].idx = -1; rates[i].count = 0; } if (rates[i].idx < 0 || !rates[i].count) break; } } static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, u32 *mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { u32 i, flags; *mask = sdata->rc_rateidx_mask[sband->band]; flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); for (i = 0; i < sband->n_bitrates; i++) { if ((flags & sband->bitrates[i].flags) != flags) *mask &= ~BIT(i); } if (*mask == (1 << sband->n_bitrates) - 1 && !sdata->rc_has_mcs_mask[sband->band] && !sdata->rc_has_vht_mcs_mask[sband->band]) return false; if (sdata->rc_has_mcs_mask[sband->band]) memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band], IEEE80211_HT_MCS_MASK_LEN); else memset(mcs_mask, 0xff, IEEE80211_HT_MCS_MASK_LEN); if (sdata->rc_has_vht_mcs_mask[sband->band]) memcpy(vht_mask, sdata->rc_rateidx_vht_mcs_mask[sband->band], sizeof(u16) * NL80211_VHT_NSS_MAX); else memset(vht_mask, 0xff, sizeof(u16) * NL80211_VHT_NSS_MAX); if (sta) { __le16 sta_vht_cap; u16 sta_vht_mask[NL80211_VHT_NSS_MAX]; /* Filter out rates that the STA does not support */ *mask &= sta->deflink.supp_rates[sband->band]; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) mcs_mask[i] &= sta->deflink.ht_cap.mcs.rx_mask[i]; sta_vht_cap = sta->deflink.vht_cap.vht_mcs.rx_mcs_map; ieee80211_get_vht_mask_from_cap(sta_vht_cap, sta_vht_mask); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) vht_mask[i] &= sta_vht_mask[i]; } return true; } static void rate_control_apply_mask_ratetbl(struct sta_info *sta, struct ieee80211_supported_band *sband, struct ieee80211_sta_rates *rates) { int i; u32 mask; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mask[NL80211_VHT_NSS_MAX]; enum nl80211_chan_width chan_width; if (!rate_control_cap_mask(sta->sdata, sband, &sta->sta, &mask, mcs_mask, vht_mask)) return; chan_width = sta->sdata->vif.bss_conf.chandef.width; for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) { if (rates->rate[i].idx < 0) break; rate_idx_match_mask(&rates->rate[i].idx, &rates->rate[i].flags, sband, chan_width, mask, mcs_mask, vht_mask); } } static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, struct ieee80211_tx_rate *rates, int max_rates) { enum nl80211_chan_width chan_width; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; u32 mask; u16 rate_flags, vht_mask[NL80211_VHT_NSS_MAX]; int i; /* * Try to enforce the rateidx mask the user wanted. skip this if the * default mask (allow all rates) is used to save some processing for * the common case. */ if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask, vht_mask)) return; /* * Make sure the rate index selected for each TX rate is * included in the configured mask and change the rate indexes * if needed. */ chan_width = sdata->vif.bss_conf.chandef.width; for (i = 0; i < max_rates; i++) { /* Skip invalid rates */ if (rates[i].idx < 0) break; rate_flags = rates[i].flags; rate_idx_match_mask(&rates[i].idx, &rate_flags, sband, chan_width, mask, mcs_mask, vht_mask); rates[i].flags = rate_flags; } } void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct sk_buff *skb, struct ieee80211_tx_rate *dest, int max_rates) { struct ieee80211_sub_if_data *sdata; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_supported_band *sband; rate_control_fill_sta_table(sta, info, dest, max_rates); if (!vif) return; sdata = vif_to_sdata(vif); sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) rate_control_apply_mask(sdata, sta, sband, dest, max_rates); if (dest[0].idx < 0) __rate_control_send_low(&sdata->local->hw, sband, sta, info, sdata->rc_rateidx_mask[info->band]); if (sta) rate_fixup_ratelist(vif, sband, info, dest, max_rates); } EXPORT_SYMBOL(ieee80211_get_tx_rates); void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc) { struct rate_control_ref *ref = sdata->local->rate_ctrl; void *priv_sta = NULL; struct ieee80211_sta *ista = NULL; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } if (rate_control_send_low(sta ? &sta->sta : NULL, txrc)) return; if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { ista = &sta->sta; priv_sta = sta->rate_ctrl_priv; } if (ista) { spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); spin_unlock_bh(&sta->rate_ctrl_lock); } else { rate_control_send_low(NULL, txrc); } if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) return; ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb, info->control.rates, ARRAY_SIZE(info->control.rates)); } int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_sta_rates *rates) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sta_rates *old; struct ieee80211_supported_band *sband; sband = ieee80211_get_sband(sta->sdata); if (!sband) return -EINVAL; rate_control_apply_mask_ratetbl(sta, sband, rates); /* * mac80211 guarantees that this function will not be called * concurrently, so the following RCU access is safe, even without * extra locking. This can not be checked easily, so we just set * the condition to true. */ old = rcu_dereference_protected(pubsta->rates, true); rcu_assign_pointer(pubsta->rates, rates); if (old) kfree_rcu(old, rcu_head); if (sta->uploaded) drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta)); return 0; } EXPORT_SYMBOL(rate_control_set_rates); int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name) { struct rate_control_ref *ref; ASSERT_RTNL(); if (local->open_count) return -EBUSY; if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { if (WARN_ON(!local->ops->set_rts_threshold)) return -EINVAL; return 0; } ref = rate_control_alloc(name, local); if (!ref) { wiphy_warn(local->hw.wiphy, "Failed to select rate control algorithm\n"); return -ENOENT; } WARN_ON(local->rate_ctrl); local->rate_ctrl = ref; wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n", ref->ops->name); return 0; } void rate_control_deinitialize(struct ieee80211_local *local) { struct rate_control_ref *ref; ref = local->rate_ctrl; if (!ref) return; local->rate_ctrl = NULL; rate_control_free(local, ref); }
19 285 274 600 599 484 484 485 484 485 485 484 484 485 409 485 486 270 273 273 273 273 16 16 16 16 16 16 16 15 16 274 274 274 274 274 274 274 274 273 273 241 273 273 273 20 20 16 16 169 169 169 4 4 4 4 4 4 35 35 12 35 1335 1335 1335 1335 1335 1334 1334 172 1335 1334 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/uio.h> #include "kernfs-internal.h" struct kernfs_open_node { struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ unsigned int nr_mmapped; unsigned int nr_to_release; }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) { int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); return &kernfs_locks->open_file_mutex[idx]; } static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) { struct mutex *lock; lock = kernfs_open_file_mutex_ptr(kn); mutex_lock(lock); return lock; } /** * of_on - Get the kernfs_open_node of the specified kernfs_open_file * @of: target kernfs_open_file * * Return: the kernfs_open_node of the kernfs_open_file */ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { return rcu_dereference_protected(of->kn->attr.open, !list_empty(&of->list)); } /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * * Fetch and return ->attr.open of @kn when caller holds the * kernfs_open_file_mutex_ptr(kn). * * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when * the caller guarantees that this mutex is being held, other updaters can't * change ->attr.open and this means that we can safely deref ->attr.open * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. * * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active(of->kn); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. */ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active(of->kn); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active(of->kn); return ret; } #ifdef CONFIG_NUMA static int kernfs_vma_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return 0; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = 0; if (of->vm_ops->set_policy) ret = of->vm_ops->set_policy(vma, new); kernfs_put_active(of->kn); return ret; } static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); struct mempolicy *pol; if (!of->vm_ops) return vma->vm_policy; if (!kernfs_get_active(of->kn)) return vma->vm_policy; pol = vma->vm_policy; if (of->vm_ops->get_policy) pol = of->vm_ops->get_policy(vma, addr); kernfs_put_active(of->kn); return pol; } #endif static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, #ifdef CONFIG_NUMA .set_policy = kernfs_vma_set_policy, .get_policy = kernfs_vma_get_policy, #endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_file_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc(sizeof(*on), GFP_KERNEL); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * Both paths of the branch look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). */ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. */ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active(kn)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active(kn); return ret; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); root = kernfs_root(kn); /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; struct inode *inode; struct qstr name; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name)); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, p_inode, &name, inode, 0); iput(p_inode); } kernfs_put(parent); } if (!p_inode) fsnotify_inode(inode, FS_MODIFY); iput(inode); } up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; /* kick poll immediately */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, .llseek = generic_file_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @uid: uid of the file * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Return: the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; }
4701 4703 4700 4701 376 376 376 4630 3400 4630 4630 4630 4629 4629 346 4447 1 1 1 64 64 64 33 64 33 64 64 64 64 171 171 170 171 1351 1351 1351 1351 1351 1351 26 26 1 1 1346 1230 1230 114 126 105 41 41 126 1351 1351 1351 1350 200 200 1 1 1 199 1 200 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> /** * idr_alloc_u32() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @nextid: Pointer to an ID. * @max: The maximum ID to allocate (inclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @nextid and @max. * Note that @max is inclusive whereas the @end parameter to idr_alloc() * is exclusive. The new ID is assigned to @nextid before the pointer * is inserted into the IDR, so if @nextid points into the object pointed * to by @ptr, a concurrent lookup will not find an uninitialised ID. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. If an error occurred, * @nextid is unchanged. */ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned long max, gfp_t gfp) { struct radix_tree_iter iter; void __rcu **slot; unsigned int base = idr->idr_base; unsigned int id = *nextid; if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) idr->idr_rt.xa_flags |= IDR_RT_MARKER; id = (id < base) ? 0 : id - base; radix_tree_iter_init(&iter, id); slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base); if (IS_ERR(slot)) return PTR_ERR(slot); *nextid = iter.index + base; /* there is a memory barrier inside radix_tree_iter_replace() */ radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr); radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE); return 0; } EXPORT_SYMBOL_GPL(idr_alloc_u32); /** * idr_alloc() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = start; int ret; if (WARN_ON_ONCE(start < 0)) return -EINVAL; ret = idr_alloc_u32(idr, ptr, &id, end > 0 ? end - 1 : INT_MAX, gfp); if (ret) return ret; return id; } EXPORT_SYMBOL_GPL(idr_alloc); /** * idr_alloc_cyclic() - Allocate an ID cyclically. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * The search for an unused ID will start at the last ID allocated and will * wrap around to @start if no free IDs are found before reaching @end. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = idr->idr_next; int err, max = end > 0 ? end - 1 : INT_MAX; if ((int)id < start) id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); if ((err == -ENOSPC) && (id > start)) { id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); } if (err) return err; idr->idr_next = id + 1; return id; } EXPORT_SYMBOL(idr_alloc_cyclic); /** * idr_remove() - Remove an ID from the IDR. * @idr: IDR handle. * @id: Pointer ID. * * Removes this ID from the IDR. If the ID was not previously in the IDR, * this function returns %NULL. * * Since this function modifies the IDR, the caller should provide their * own locking to ensure that concurrent modification of the same IDR is * not possible. * * Return: The pointer formerly associated with this ID. */ void *idr_remove(struct idr *idr, unsigned long id) { return radix_tree_delete_item(&idr->idr_rt, id - idr->idr_base, NULL); } EXPORT_SYMBOL_GPL(idr_remove); /** * idr_find() - Return pointer for given ID. * @idr: IDR handle. * @id: Pointer ID. * * Looks up the pointer associated with this ID. A %NULL pointer may * indicate that @id is not allocated or that the %NULL pointer was * associated with this ID. * * This function can be called under rcu_read_lock(), given that the leaf * pointers lifetimes are correctly managed. * * Return: The pointer associated with this ID. */ void *idr_find(const struct idr *idr, unsigned long id) { return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base); } EXPORT_SYMBOL_GPL(idr_find); /** * idr_for_each() - Iterate through all stored pointers. * @idr: IDR handle. * @fn: Function to be called for each pointer. * @data: Data passed to callback function. * * The callback function will be called for each entry in @idr, passing * the ID, the entry and @data. * * If @fn returns anything other than %0, the iteration stops and that * value is returned from this function. * * idr_for_each() can be called concurrently with idr_alloc() and * idr_remove() if protected by RCU. Newly added entries may not be * seen and deleted entries may be seen, but adding and removing entries * will not cause other entries to be skipped, nor spurious ones to be seen. */ int idr_for_each(const struct idr *idr, int (*fn)(int id, void *p, void *data), void *data) { struct radix_tree_iter iter; void __rcu **slot; int base = idr->idr_base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) { int ret; unsigned long id = iter.index + base; if (WARN_ON_ONCE(id > INT_MAX)) break; ret = fn(id, rcu_dereference_raw(*slot), data); if (ret) return ret; } return 0; } EXPORT_SYMBOL(idr_for_each); /** * idr_get_next_ul() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next_ul(struct idr *idr, unsigned long *nextid) { struct radix_tree_iter iter; void __rcu **slot; void *entry = NULL; unsigned long base = idr->idr_base; unsigned long id = *nextid; id = (id < base) ? 0 : id - base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) { entry = rcu_dereference_raw(*slot); if (!entry) continue; if (!xa_is_internal(entry)) break; if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry)) break; slot = radix_tree_iter_retry(&iter); } if (!slot) return NULL; *nextid = iter.index + base; return entry; } EXPORT_SYMBOL(idr_get_next_ul); /** * idr_get_next() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next(struct idr *idr, int *nextid) { unsigned long id = *nextid; void *entry = idr_get_next_ul(idr, &id); if (WARN_ON_ONCE(id > INT_MAX)) return NULL; *nextid = id; return entry; } EXPORT_SYMBOL(idr_get_next); /** * idr_replace() - replace pointer for given ID. * @idr: IDR handle. * @ptr: New pointer to associate with the ID. * @id: ID to change. * * Replace the pointer registered with an ID and return the old value. * This function can be called under the RCU read lock concurrently with * idr_alloc() and idr_remove() (as long as the ID being removed is not * the one being replaced!). * * Returns: the old value on success. %-ENOENT indicates that @id was not * found. %-EINVAL indicates that @ptr was not valid. */ void *idr_replace(struct idr *idr, void *ptr, unsigned long id) { struct radix_tree_node *node; void __rcu **slot = NULL; void *entry; id -= idr->idr_base; entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); __radix_tree_replace(&idr->idr_rt, node, slot, ptr); return entry; } EXPORT_SYMBOL(idr_replace); /** * DOC: IDA description * * The IDA is an ID allocator which does not provide the ability to * associate an ID with a pointer. As such, it only needs to store one * bit per ID, and so is more space efficient than an IDR. To use an IDA, * define it using DEFINE_IDA() (or embed a &struct ida in a data structure, * then initialise it using ida_init()). To allocate a new ID, call * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range(). * To free an ID, call ida_free(). * * ida_destroy() can be used to dispose of an IDA without needing to * free the individual IDs in it. You can use ida_is_empty() to find * out whether the IDA has any IDs currently allocated. * * The IDA handles its own locking. It is safe to call any of the IDA * functions without synchronisation in your code. * * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward * limitation, it should be quite straightforward to raise the maximum. */ /* * Developer's notes: * * The IDA uses the functionality provided by the XArray to store bitmaps in * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap * have been set. * * I considered telling the XArray that each slot is an order-10 node * and indexing by bit number, but the XArray can't allow a single multi-index * entry in the head, which would significantly increase memory consumption * for the IDA. So instead we divide the index by the number of bits in the * leaf bitmap before doing a radix tree lookup. * * As an optimisation, if there are only a few low bits set in any given * leaf, instead of allocating a 128-byte bitmap, we store the bits * as a value entry. Value entries never have the XA_FREE_MARK cleared * because we can always convert them into a bitmap entry. * * It would be possible to optimise further; once we've run out of a * single 128-byte bitmap, we currently switch to a 576-byte node, put * the 128-byte bitmap in the first entry and then start allocating extra * 128-byte entries. We could instead use the 512 bytes of the node's * data as a bitmap before moving to that scheme. I do not believe this * is a worthwhile optimisation; Rasmus Villemoes surveyed the current * users of the IDA and almost none of them use more than 1024 entries. * Those that do use more than the 8192 IDs that the 512 bytes would * provide. * * The IDA always uses a lock to alloc/free. If we add a 'test_bit' * equivalent, it will still need locking. Going to RCU lookup would require * using RCU to free bitmaps, and that's not trivial without embedding an * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte * bitmap, which is excessive. */ /** * ida_alloc_range() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and @max, inclusive. The allocated ID will * not exceed %INT_MAX, even if @max is larger. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, gfp_t gfp) { XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS); unsigned bit = min % IDA_BITMAP_BITS; unsigned long flags; struct ida_bitmap *bitmap, *alloc = NULL; if ((int)min < 0) return -ENOSPC; if ((int)max < 0) max = INT_MAX; retry: xas_lock_irqsave(&xas, flags); next: bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK); if (xas.xa_index > min / IDA_BITMAP_BITS) bit = 0; if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (xa_is_value(bitmap)) { unsigned long tmp = xa_to_value(bitmap); if (bit < BITS_PER_XA_VALUE) { bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit < BITS_PER_XA_VALUE) { tmp |= 1UL << bit; xas_store(&xas, xa_mk_value(tmp)); goto out; } } bitmap = alloc; if (!bitmap) bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); if (!bitmap) goto alloc; bitmap->bitmap[0] = tmp; xas_store(&xas, bitmap); if (xas_error(&xas)) { bitmap->bitmap[0] = 0; goto out; } } if (bitmap) { bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit == IDA_BITMAP_BITS) goto next; __set_bit(bit, bitmap->bitmap); if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) xas_clear_mark(&xas, XA_FREE_MARK); } else { if (bit < BITS_PER_XA_VALUE) { bitmap = xa_mk_value(1UL << bit); } else { bitmap = alloc; if (!bitmap) bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); if (!bitmap) goto alloc; __set_bit(bit, bitmap->bitmap); } xas_store(&xas, bitmap); } out: xas_unlock_irqrestore(&xas, flags); if (xas_nomem(&xas, gfp)) { xas.xa_index = min / IDA_BITMAP_BITS; bit = min % IDA_BITMAP_BITS; goto retry; } if (bitmap != alloc) kfree(alloc); if (xas_error(&xas)) return xas_error(&xas); return xas.xa_index * IDA_BITMAP_BITS + bit; alloc: xas_unlock_irqrestore(&xas, flags); alloc = kzalloc(sizeof(*bitmap), gfp); if (!alloc) return -ENOMEM; xas_set(&xas, min / IDA_BITMAP_BITS); bit = min % IDA_BITMAP_BITS; goto retry; nospc: xas_unlock_irqrestore(&xas, flags); kfree(alloc); return -ENOSPC; } EXPORT_SYMBOL(ida_alloc_range); /** * ida_free() - Release an allocated ID. * @ida: IDA handle. * @id: Previously allocated ID. * * Context: Any context. It is safe to call this function without * locking in your code. */ void ida_free(struct ida *ida, unsigned int id) { XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS); unsigned bit = id % IDA_BITMAP_BITS; struct ida_bitmap *bitmap; unsigned long flags; if ((int)id < 0) return; xas_lock_irqsave(&xas, flags); bitmap = xas_load(&xas); if (xa_is_value(bitmap)) { unsigned long v = xa_to_value(bitmap); if (bit >= BITS_PER_XA_VALUE) goto err; if (!(v & (1UL << bit))) goto err; v &= ~(1UL << bit); if (!v) goto delete; xas_store(&xas, xa_mk_value(v)); } else { if (!test_bit(bit, bitmap->bitmap)) goto err; __clear_bit(bit, bitmap->bitmap); xas_set_mark(&xas, XA_FREE_MARK); if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { kfree(bitmap); delete: xas_store(&xas, NULL); } } xas_unlock_irqrestore(&xas, flags); return; err: xas_unlock_irqrestore(&xas, flags); WARN(1, "ida_free called for id=%d which is not allocated.\n", id); } EXPORT_SYMBOL(ida_free); /** * ida_destroy() - Free all IDs. * @ida: IDA handle. * * Calling this function frees all IDs and releases all resources used * by an IDA. When this call returns, the IDA is empty and can be reused * or freed. If the IDA is already empty, there is no need to call this * function. * * Context: Any context. It is safe to call this function without * locking in your code. */ void ida_destroy(struct ida *ida) { XA_STATE(xas, &ida->xa, 0); struct ida_bitmap *bitmap; unsigned long flags; xas_lock_irqsave(&xas, flags); xas_for_each(&xas, bitmap, ULONG_MAX) { if (!xa_is_value(bitmap)) kfree(bitmap); xas_store(&xas, NULL); } xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(ida_destroy); #ifndef __KERNEL__ extern void xa_dump_index(unsigned long index, unsigned int shift); #define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS) static void ida_dump_entry(void *entry, unsigned long index) { unsigned long i; if (!entry) return; if (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); unsigned int shift = node->shift + IDA_CHUNK_SHIFT + XA_CHUNK_SHIFT; xa_dump_index(index * IDA_BITMAP_BITS, shift); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) ida_dump_entry(node->slots[i], index | (i << node->shift)); } else if (xa_is_value(entry)) { xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG)); pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry); } else { struct ida_bitmap *bitmap = entry; xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT); pr_cont("bitmap: %p data", bitmap); for (i = 0; i < IDA_BITMAP_LONGS; i++) pr_cont(" %lx", bitmap->bitmap[i]); pr_cont("\n"); } } static void ida_dump(struct ida *ida) { struct xarray *xa = &ida->xa; pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head, xa->xa_flags >> ROOT_TAG_SHIFT); ida_dump_entry(xa->xa_head, 0); } #endif
122 122 828 599 597 590 590 591 2806 2806 2806 2806 2805 2806 2805 2805 2320 1527 6163 6163 6163 388 1 6163 1093 2155 2155 1530 1495 1527 23 1 1530 442 1527 2826 2825 2826 2826 863 2825 863 2825 1515 1515 492 492 492 492 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { local_lock_t lock; struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). */ struct cpu_fbatches { local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch activate; #endif }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void __page_cache_release(struct folio *folio) { if (folio_test_lru(folio)) { struct lruvec *lruvec; unsigned long flags; lruvec = folio_lruvec_lock_irqsave(folio, &flags); lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); unlock_page_lruvec_irqrestore(lruvec, flags); } /* See comment on folio_test_mlocked in release_pages() */ if (unlikely(folio_test_mlocked(folio))) { long nr_pages = folio_nr_pages(folio); __folio_clear_mlocked(folio); zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } } static void __folio_put_small(struct folio *folio) { __page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } static void __folio_put_large(struct folio *folio) { /* * __page_cache_release() is supposed to be called for thp, not for * hugetlb. This is because hugetlb page does never have PageLRU set * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ if (!folio_test_hugetlb(folio)) __page_cache_release(folio); destroy_large_folio(folio); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) free_zone_device_page(&folio->page); else if (unlikely(folio_test_large(folio))) __folio_put_large(folio); else __folio_put_small(folio); } EXPORT_SYMBOL(__folio_put); /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru * * Release a list of pages which are strung together on page.lru. */ void put_pages_list(struct list_head *pages) { struct folio *folio, *next; list_for_each_entry_safe(folio, next, pages, lru) { if (!folio_put_testzero(folio)) { list_del(&folio->lru); continue; } if (folio_test_large(folio)) { list_del(&folio->lru); __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ } free_unref_page_list(pages); INIT_LIST_HEAD(pages); } EXPORT_SYMBOL(put_pages_list); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of release_pages(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. */ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch->folios, folio_batch_count(fbatch)); folio_batch_reinit(fbatch); } static void folio_batch_add_and_move(struct folio_batch *fbatch, struct folio *folio, move_fn_t move_fn) { if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && !lru_cache_disabled()) return; folio_batch_move_lru(fbatch, move_fn); } static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio)) { lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (!folio_test_locked(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio) && folio_test_lru(folio)) { struct folio_batch *fbatch; unsigned long flags; folio_get(folio); local_lock_irqsave(&lru_rotate.lock, flags); fbatch = this_cpu_ptr(&lru_rotate.fbatch); folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; do { unsigned long lrusize; /* * Hold lruvec->lru_lock is safe here, since * 1) The pinned lruvec in reclaim, or * 2) From a pre-LRU page during refault (which also holds the * rcu lock, so would be safe even if the page was on the LRU * and could move simultaneously to a new lruvec). */ spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. */ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, folio_activate_fn); } void folio_activate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_active(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.activate); folio_batch_add_and_move(fbatch, folio, folio_activate_fn); local_unlock(&cpu_fbatches.lock); } } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (folio_test_clear_lru(folio)) { lruvec = folio_lruvec_lock_irq(folio); folio_activate_fn(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void folio_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; if (!folio_test_referenced(folio)) { folio_set_referenced(folio); return; } if (!folio_test_workingset(folio)) { folio_set_workingset(folio); return; } /* see the comment on MAX_NR_TIERS */ do { new_flags = old_flags & LRU_REFS_MASK; if (new_flags == LRU_REFS_MASK) break; new_flags += BIT(LRU_REFS_PGOFF); new_flags |= old_flags & ~LRU_REFS_MASK; } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } #else static void folio_inc_refs(struct folio *folio) { } #endif /* CONFIG_LRU_GEN */ /* * Mark a page as having seen activity. * * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced * * When a newly allocated page is not yet visible, so safe for non-atomic ops, * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void folio_mark_accessed(struct folio *folio) { if (lru_gen_enabled()) { folio_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) { struct folio_batch *fbatch; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_add_folio() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); folio_batch_add_and_move(fbatch, folio, lru_add_fn); local_unlock(&cpu_fbatches.lock); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list. */ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) { if (folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add_fn); fbatch = &per_cpu(lru_rotate.fbatch, cpu); /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); folio_batch_move_lru(fbatch, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file_fn); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_fn); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree_fn); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { struct folio_batch *fbatch; /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); local_unlock(&cpu_fbatches.lock); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); local_unlock(&cpu_fbatches.lock); } } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (folio_test_lru(folio) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { struct folio_batch *fbatch; folio_get(folio); local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree); folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); local_unlock(&cpu_fbatches.lock); } } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of pages to be migrated using isolate_lru_page(). * It drains pages on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { int i; struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { struct folio *folio; /* Turn any of the argument types into a folio */ folio = page_folio(encoded_page_ptr(encoded[i])); /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the * same lruvec. The lock is held only if lruvec != NULL. */ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (is_huge_zero_page(&folio->page)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (put_devmap_managed_page(&folio->page)) continue; if (folio_put_testzero(folio)) free_zone_device_page(&folio->page); continue; } if (!folio_put_testzero(folio)) continue; if (folio_test_large(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } __folio_put_large(folio); continue; } if (folio_test_lru(folio)) { struct lruvec *prev_lruvec = lruvec; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); if (prev_lruvec != lruvec) lock_batch = 0; lruvec_del_folio(lruvec, folio); __folio_clear_lru_flags(folio); } /* * In rare cases, when truncation or holepunching raced with * munlock after VM_LOCKED was cleared, Mlocked may still be * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large. */ if (unlikely(folio_test_mlocked(folio))) { __folio_clear_mlocked(folio); zone_stat_sub_folio(folio, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); } list_add(&folio->lru, &pages_to_free); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } release_pages(fbatch->folios, folio_batch_count(fbatch)); folio_batch_reinit(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. */ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * Lauro Ramos Venancio <lauro.venancio@openbossa.org> */ #include <linux/nfc.h> #include <linux/module.h> #include "nfc.h" static DEFINE_RWLOCK(proto_tab_lock); static const struct nfc_protocol *proto_tab[NFC_SOCKPROTO_MAX]; static int nfc_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int rc = -EPROTONOSUPPORT; if (net != &init_net) return -EAFNOSUPPORT; if (proto < 0 || proto >= NFC_SOCKPROTO_MAX) return -EINVAL; read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); return rc; } static const struct net_proto_family nfc_sock_family_ops = { .owner = THIS_MODULE, .family = PF_NFC, .create = nfc_sock_create, }; int nfc_proto_register(const struct nfc_protocol *nfc_proto) { int rc; if (nfc_proto->id < 0 || nfc_proto->id >= NFC_SOCKPROTO_MAX) return -EINVAL; rc = proto_register(nfc_proto->proto, 0); if (rc) return rc; write_lock(&proto_tab_lock); if (proto_tab[nfc_proto->id]) rc = -EBUSY; else proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); if (rc) proto_unregister(nfc_proto->proto); return rc; } EXPORT_SYMBOL(nfc_proto_register); void nfc_proto_unregister(const struct nfc_protocol *nfc_proto) { write_lock(&proto_tab_lock); proto_tab[nfc_proto->id] = NULL; write_unlock(&proto_tab_lock); proto_unregister(nfc_proto->proto); } EXPORT_SYMBOL(nfc_proto_unregister); int __init af_nfc_init(void) { return sock_register(&nfc_sock_family_ops); } void __exit af_nfc_exit(void) { sock_unregister(PF_NFC); }
1 3 3 1 2 1 2 3 7 7 7 7 7 7 3 2 3 2 1 7 7 7 3 3 1 2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 // SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. * * Authors: Pavel Emelyanov, <xemul@parallels.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/udp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) goto out; err = -ENOMEM; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_put(sk); out_nosk: return err; } static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct nlattr *bc; cb_data = cb->data; bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct udp_hslot *hslot = &table->hash[slot]; struct sock *sk; num = 0; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } next: num++; } spin_unlock_bh(&hslot->lock); } done: cb->args[0] = slot; cb->args[1] = num; } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int __udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req, struct udp_table *tbl) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); } #endif else { rcu_read_unlock(); return -EINVAL; } if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); if (!sk) return -ENOENT; if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_put(sk); return -ENOENT; } err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, &udplite_table); } #endif static const struct inet_diag_handler udp_diag_handler = { .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udp_diag_destroy, #endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udplite_diag_destroy, #endif }; static int __init udp_diag_init(void) { int err; err = inet_diag_register(&udp_diag_handler); if (err) goto out; err = inet_diag_register(&udplite_diag_handler); if (err) goto out_lite; out: return err; out_lite: inet_diag_unregister(&udp_diag_handler); goto out; } static void __exit udp_diag_exit(void) { inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli */ #ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #define _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> int batadv_tt_init(struct batadv_priv *bat_priv); bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, const char *message, bool roaming); int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb); int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, s32 match_vid, const char *message); struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); void batadv_tt_global_entry_release(struct kref *ref); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, const u8 *src, const u8 *addr, unsigned short vid); void batadv_tt_free(struct batadv_priv *bat_priv); bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, unsigned short vid); void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv); bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); int batadv_tt_cache_init(void); void batadv_tt_cache_destroy(void); /** * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and * possibly release it * @tt_global_entry: tt_global_entry to be free'd */ static inline void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) { if (!tt_global_entry) return; kref_put(&tt_global_entry->common.refcount, batadv_tt_global_entry_release); } #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq #if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H #include <linux/tracepoint.h> struct irqaction; struct softirq_action; #define SOFTIRQ_NAME_LIST \ softirq_name(HI) \ softirq_name(TIMER) \ softirq_name(NET_TX) \ softirq_name(NET_RX) \ softirq_name(BLOCK) \ softirq_name(IRQ_POLL) \ softirq_name(TASKLET) \ softirq_name(SCHED) \ softirq_name(HRTIMER) \ softirq_name_end(RCU) #undef softirq_name #undef softirq_name_end #define softirq_name(sirq) TRACE_DEFINE_ENUM(sirq##_SOFTIRQ); #define softirq_name_end(sirq) TRACE_DEFINE_ENUM(sirq##_SOFTIRQ); SOFTIRQ_NAME_LIST #undef softirq_name #undef softirq_name_end #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }, #define softirq_name_end(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ __print_symbolic(val, SOFTIRQ_NAME_LIST) /** * irq_handler_entry - called immediately before the irq action handler * @irq: irq number * @action: pointer to struct irqaction * * The struct irqaction pointed to by @action contains various * information about the handler, including the device name, * @action->name, and the device id, @action->dev_id. When used in * conjunction with the irq_handler_exit tracepoint, we can figure * out irq handler latencies. */ TRACE_EVENT(irq_handler_entry, TP_PROTO(int irq, struct irqaction *action), TP_ARGS(irq, action), TP_STRUCT__entry( __field( int, irq ) __string( name, action->name ) ), TP_fast_assign( __entry->irq = irq; __assign_str(name, action->name); ), TP_printk("irq=%d name=%s", __entry->irq, __get_str(name)) ); /** * irq_handler_exit - called immediately after the irq action handler returns * @irq: irq number * @action: pointer to struct irqaction * @ret: return value * * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding * @action->handler successfully handled this irq. Otherwise, the irq might be * a shared irq line, or the irq was not handled successfully. Can be used in * conjunction with the irq_handler_entry to understand irq handler latencies. */ TRACE_EVENT(irq_handler_exit, TP_PROTO(int irq, struct irqaction *action, int ret), TP_ARGS(irq, action, ret), TP_STRUCT__entry( __field( int, irq ) __field( int, ret ) ), TP_fast_assign( __entry->irq = irq; __entry->ret = ret; ), TP_printk("irq=%d ret=%s", __entry->irq, __entry->ret ? "handled" : "unhandled") ); DECLARE_EVENT_CLASS(softirq, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr), TP_STRUCT__entry( __field( unsigned int, vec ) ), TP_fast_assign( __entry->vec = vec_nr; ), TP_printk("vec=%u [action=%s]", __entry->vec, show_softirq_name(__entry->vec)) ); /** * softirq_entry - called immediately before the softirq handler * @vec_nr: softirq vector number * * When used in combination with the softirq_exit tracepoint * we can determine the softirq handler routine. */ DEFINE_EVENT(softirq, softirq_entry, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); /** * softirq_exit - called immediately after the softirq handler returns * @vec_nr: softirq vector number * * When used in combination with the softirq_entry tracepoint * we can determine the softirq handler routine. */ DEFINE_EVENT(softirq, softirq_exit, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); /** * softirq_raise - called immediately when a softirq is raised * @vec_nr: softirq vector number * * When used in combination with the softirq_entry tracepoint * we can determine the softirq raise to run latency. */ DEFINE_EVENT(softirq, softirq_raise, TP_PROTO(unsigned int vec_nr), TP_ARGS(vec_nr) ); DECLARE_EVENT_CLASS(tasklet, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func), TP_STRUCT__entry( __field( void *, tasklet) __field( void *, func) ), TP_fast_assign( __entry->tasklet = t; __entry->func = func; ), TP_printk("tasklet=%ps function=%ps", __entry->tasklet, __entry->func) ); /** * tasklet_entry - called immediately before the tasklet is run * @t: tasklet pointer * @func: tasklet callback or function being run * * Used to find individual tasklet execution time */ DEFINE_EVENT(tasklet, tasklet_entry, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func) ); /** * tasklet_exit - called immediately after the tasklet is run * @t: tasklet pointer * @func: tasklet callback or function being run * * Used to find individual tasklet execution time */ DEFINE_EVENT(tasklet, tasklet_exit, TP_PROTO(struct tasklet_struct *t, void *func), TP_ARGS(t, func) ); #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
337 325 658 325 660 660 657 659 672 671 669 327 325 660 341 337 227 6 5 3819 3825 2072 1620 3827 676 671 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/network.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* Structure for holding inet domain socket's address. */ struct tomoyo_inet_addr_info { __be16 port; /* In network byte order. */ const __be32 *address; /* In network byte order. */ bool is_ipv6; }; /* Structure for holding unix domain socket's address. */ struct tomoyo_unix_addr_info { u8 *addr; /* This may not be '\0' terminated string. */ unsigned int addr_len; }; /* Structure for holding socket address. */ struct tomoyo_addr_info { u8 protocol; u8 operation; struct tomoyo_inet_addr_info inet; struct tomoyo_unix_addr_info unix0; }; /* String table for socket's protocols. */ const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX] = { [SOCK_STREAM] = "stream", [SOCK_DGRAM] = "dgram", [SOCK_RAW] = "raw", [SOCK_SEQPACKET] = "seqpacket", [0] = " ", /* Dummy for avoiding NULL pointer dereference. */ [4] = " ", /* Dummy for avoiding NULL pointer dereference. */ }; /** * tomoyo_parse_ipaddr_union - Parse an IP address. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_ipaddr_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param, struct tomoyo_ipaddr_union *ptr) { u8 * const min = ptr->ip[0].in6_u.u6_addr8; u8 * const max = ptr->ip[1].in6_u.u6_addr8; char *address = tomoyo_read_token(param); const char *end; if (!strchr(address, ':') && in4_pton(address, -1, min, '-', &end) > 0) { ptr->is_ipv6 = false; if (!*end) ptr->ip[1].s6_addr32[0] = ptr->ip[0].s6_addr32[0]; else if (*end++ != '-' || in4_pton(end, -1, max, '\0', &end) <= 0 || *end) return false; return true; } if (in6_pton(address, -1, min, '-', &end) > 0) { ptr->is_ipv6 = true; if (!*end) memmove(max, min, sizeof(u16) * 8); else if (*end++ != '-' || in6_pton(end, -1, max, '\0', &end) <= 0 || *end) return false; return true; } return false; } /** * tomoyo_print_ipv4 - Print an IPv4 address. * * @buffer: Buffer to write to. * @buffer_len: Size of @buffer. * @min_ip: Pointer to __be32. * @max_ip: Pointer to __be32. * * Returns nothing. */ static void tomoyo_print_ipv4(char *buffer, const unsigned int buffer_len, const __be32 *min_ip, const __be32 *max_ip) { snprintf(buffer, buffer_len, "%pI4%c%pI4", min_ip, *min_ip == *max_ip ? '\0' : '-', max_ip); } /** * tomoyo_print_ipv6 - Print an IPv6 address. * * @buffer: Buffer to write to. * @buffer_len: Size of @buffer. * @min_ip: Pointer to "struct in6_addr". * @max_ip: Pointer to "struct in6_addr". * * Returns nothing. */ static void tomoyo_print_ipv6(char *buffer, const unsigned int buffer_len, const struct in6_addr *min_ip, const struct in6_addr *max_ip) { snprintf(buffer, buffer_len, "%pI6c%c%pI6c", min_ip, !memcmp(min_ip, max_ip, 16) ? '\0' : '-', max_ip); } /** * tomoyo_print_ip - Print an IP address. * * @buf: Buffer to write to. * @size: Size of @buf. * @ptr: Pointer to "struct ipaddr_union". * * Returns nothing. */ void tomoyo_print_ip(char *buf, const unsigned int size, const struct tomoyo_ipaddr_union *ptr) { if (ptr->is_ipv6) tomoyo_print_ipv6(buf, size, &ptr->ip[0], &ptr->ip[1]); else tomoyo_print_ipv4(buf, size, &ptr->ip[0].s6_addr32[0], &ptr->ip[1].s6_addr32[0]); } /* * Mapping table from "enum tomoyo_network_acl_index" to * "enum tomoyo_mac_index" for inet domain socket. */ static const u8 tomoyo_inet2mac [TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = { [SOCK_STREAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_STREAM_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT, }, [SOCK_DGRAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_DGRAM_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_INET_DGRAM_SEND, }, [SOCK_RAW] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_RAW_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_INET_RAW_SEND, }, }; /* * Mapping table from "enum tomoyo_network_acl_index" to * "enum tomoyo_mac_index" for unix domain socket. */ static const u8 tomoyo_unix2mac [TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = { [SOCK_STREAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT, }, [SOCK_DGRAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND, }, [SOCK_SEQPACKET] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT, }, }; /** * tomoyo_same_inet_acl - Check for duplicated "struct tomoyo_inet_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_inet_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_inet_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_inet_acl *p2 = container_of(b, typeof(*p2), head); return p1->protocol == p2->protocol && tomoyo_same_ipaddr_union(&p1->address, &p2->address) && tomoyo_same_number_union(&p1->port, &p2->port); } /** * tomoyo_same_unix_acl - Check for duplicated "struct tomoyo_unix_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_unix_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_unix_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_unix_acl *p2 = container_of(b, typeof(*p2), head); return p1->protocol == p2->protocol && tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_inet_acl - Merge duplicated "struct tomoyo_inet_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_inet_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_inet_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_inet_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_merge_unix_acl - Merge duplicated "struct tomoyo_unix_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_unix_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_unix_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_unix_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_write_inet_network - Write "struct tomoyo_inet_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_inet_network(struct tomoyo_acl_param *param) { struct tomoyo_inet_acl e = { .head.type = TOMOYO_TYPE_INET_ACL }; int error = -EINVAL; u8 type; const char *protocol = tomoyo_read_token(param); const char *operation = tomoyo_read_token(param); for (e.protocol = 0; e.protocol < TOMOYO_SOCK_MAX; e.protocol++) if (!strcmp(protocol, tomoyo_proto_keyword[e.protocol])) break; for (type = 0; type < TOMOYO_MAX_NETWORK_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_socket_keyword[type])) e.perm |= 1 << type; if (e.protocol == TOMOYO_SOCK_MAX || !e.perm) return -EINVAL; if (param->data[0] == '@') { param->data++; e.address.group = tomoyo_get_group(param, TOMOYO_ADDRESS_GROUP); if (!e.address.group) return -ENOMEM; } else { if (!tomoyo_parse_ipaddr_union(param, &e.address)) goto out; } if (!tomoyo_parse_number_union(param, &e.port) || e.port.values[1] > 65535) goto out; error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_inet_acl, tomoyo_merge_inet_acl); out: tomoyo_put_group(e.address.group); tomoyo_put_number_union(&e.port); return error; } /** * tomoyo_write_unix_network - Write "struct tomoyo_unix_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ int tomoyo_write_unix_network(struct tomoyo_acl_param *param) { struct tomoyo_unix_acl e = { .head.type = TOMOYO_TYPE_UNIX_ACL }; int error; u8 type; const char *protocol = tomoyo_read_token(param); const char *operation = tomoyo_read_token(param); for (e.protocol = 0; e.protocol < TOMOYO_SOCK_MAX; e.protocol++) if (!strcmp(protocol, tomoyo_proto_keyword[e.protocol])) break; for (type = 0; type < TOMOYO_MAX_NETWORK_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_socket_keyword[type])) e.perm |= 1 << type; if (e.protocol == TOMOYO_SOCK_MAX || !e.perm) return -EINVAL; if (!tomoyo_parse_name_union(param, &e.name)) return -EINVAL; error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_unix_acl, tomoyo_merge_unix_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_audit_net_log - Audit network log. * * @r: Pointer to "struct tomoyo_request_info". * @family: Name of socket family ("inet" or "unix"). * @protocol: Name of protocol in @family. * @operation: Name of socket operation. * @address: Name of address. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_net_log(struct tomoyo_request_info *r, const char *family, const u8 protocol, const u8 operation, const char *address) { return tomoyo_supervisor(r, "network %s %s %s %s\n", family, tomoyo_proto_keyword[protocol], tomoyo_socket_keyword[operation], address); } /** * tomoyo_audit_inet_log - Audit INET network log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_inet_log(struct tomoyo_request_info *r) { char buf[128]; int len; const __be32 *address = r->param.inet_network.address; if (r->param.inet_network.is_ipv6) tomoyo_print_ipv6(buf, sizeof(buf), (const struct in6_addr *) address, (const struct in6_addr *) address); else tomoyo_print_ipv4(buf, sizeof(buf), address, address); len = strlen(buf); snprintf(buf + len, sizeof(buf) - len, " %u", r->param.inet_network.port); return tomoyo_audit_net_log(r, "inet", r->param.inet_network.protocol, r->param.inet_network.operation, buf); } /** * tomoyo_audit_unix_log - Audit UNIX network log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_unix_log(struct tomoyo_request_info *r) { return tomoyo_audit_net_log(r, "unix", r->param.unix_network.protocol, r->param.unix_network.operation, r->param.unix_network.address->name); } /** * tomoyo_check_inet_acl - Check permission for inet domain socket operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_inet_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_inet_acl *acl = container_of(ptr, typeof(*acl), head); const u8 size = r->param.inet_network.is_ipv6 ? 16 : 4; if (!(acl->perm & (1 << r->param.inet_network.operation)) || !tomoyo_compare_number_union(r->param.inet_network.port, &acl->port)) return false; if (acl->address.group) return tomoyo_address_matches_group (r->param.inet_network.is_ipv6, r->param.inet_network.address, acl->address.group); return acl->address.is_ipv6 == r->param.inet_network.is_ipv6 && memcmp(&acl->address.ip[0], r->param.inet_network.address, size) <= 0 && memcmp(r->param.inet_network.address, &acl->address.ip[1], size) <= 0; } /** * tomoyo_check_unix_acl - Check permission for unix domain socket operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_unix_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_unix_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.unix_network.operation)) && tomoyo_compare_name_union(r->param.unix_network.address, &acl->name); } /** * tomoyo_inet_entry - Check permission for INET network operation. * * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_inet_entry(const struct tomoyo_addr_info *address) { const int idx = tomoyo_read_lock(); struct tomoyo_request_info r; int error = 0; const u8 type = tomoyo_inet2mac[address->protocol][address->operation]; if (type && tomoyo_init_request_info(&r, NULL, type) != TOMOYO_CONFIG_DISABLED) { r.param_type = TOMOYO_TYPE_INET_ACL; r.param.inet_network.protocol = address->protocol; r.param.inet_network.operation = address->operation; r.param.inet_network.is_ipv6 = address->inet.is_ipv6; r.param.inet_network.address = address->inet.address; r.param.inet_network.port = ntohs(address->inet.port); do { tomoyo_check_acl(&r, tomoyo_check_inet_acl); error = tomoyo_audit_inet_log(&r); } while (error == TOMOYO_RETRY_REQUEST); } tomoyo_read_unlock(idx); return error; } /** * tomoyo_check_inet_address - Check permission for inet domain socket's operation. * * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * @port: Port number. * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_check_inet_address(const struct sockaddr *addr, const unsigned int addr_len, const u16 port, struct tomoyo_addr_info *address) { struct tomoyo_inet_addr_info *i = &address->inet; if (addr_len < offsetofend(struct sockaddr, sa_family)) return 0; switch (addr->sa_family) { case AF_INET6: if (addr_len < SIN6_LEN_RFC2133) goto skip; i->is_ipv6 = true; i->address = (__be32 *) ((struct sockaddr_in6 *) addr)->sin6_addr.s6_addr; i->port = ((struct sockaddr_in6 *) addr)->sin6_port; break; case AF_INET: if (addr_len < sizeof(struct sockaddr_in)) goto skip; i->is_ipv6 = false; i->address = (__be32 *) &((struct sockaddr_in *) addr)->sin_addr; i->port = ((struct sockaddr_in *) addr)->sin_port; break; default: goto skip; } if (address->protocol == SOCK_RAW) i->port = htons(port); return tomoyo_inet_entry(address); skip: return 0; } /** * tomoyo_unix_entry - Check permission for UNIX network operation. * * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_unix_entry(const struct tomoyo_addr_info *address) { const int idx = tomoyo_read_lock(); struct tomoyo_request_info r; int error = 0; const u8 type = tomoyo_unix2mac[address->protocol][address->operation]; if (type && tomoyo_init_request_info(&r, NULL, type) != TOMOYO_CONFIG_DISABLED) { char *buf = address->unix0.addr; int len = address->unix0.addr_len - sizeof(sa_family_t); if (len <= 0) { buf = "anonymous"; len = 9; } else if (buf[0]) { len = strnlen(buf, len); } buf = tomoyo_encode2(buf, len); if (buf) { struct tomoyo_path_info addr; addr.name = buf; tomoyo_fill_path_info(&addr); r.param_type = TOMOYO_TYPE_UNIX_ACL; r.param.unix_network.protocol = address->protocol; r.param.unix_network.operation = address->operation; r.param.unix_network.address = &addr; do { tomoyo_check_acl(&r, tomoyo_check_unix_acl); error = tomoyo_audit_unix_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf); } else error = -ENOMEM; } tomoyo_read_unlock(idx); return error; } /** * tomoyo_check_unix_address - Check permission for unix domain socket's operation. * * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_check_unix_address(struct sockaddr *addr, const unsigned int addr_len, struct tomoyo_addr_info *address) { struct tomoyo_unix_addr_info *u = &address->unix0; if (addr_len < offsetofend(struct sockaddr, sa_family)) return 0; if (addr->sa_family != AF_UNIX) return 0; u->addr = ((struct sockaddr_un *) addr)->sun_path; u->addr_len = addr_len; return tomoyo_unix_entry(address); } /** * tomoyo_kernel_service - Check whether I'm kernel service or not. * * Returns true if I'm kernel service, false otherwise. */ static bool tomoyo_kernel_service(void) { /* Nothing to do if I am a kernel service. */ return current->flags & PF_KTHREAD; } /** * tomoyo_sock_family - Get socket's family. * * @sk: Pointer to "struct sock". * * Returns one of PF_INET, PF_INET6, PF_UNIX or 0. */ static u8 tomoyo_sock_family(struct sock *sk) { u8 family; if (tomoyo_kernel_service()) return 0; family = sk->sk_family; switch (family) { case PF_INET: case PF_INET6: case PF_UNIX: return family; default: return 0; } } /** * tomoyo_socket_listen_permission - Check permission for listening a socket. * * @sock: Pointer to "struct socket". * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_listen_permission(struct socket *sock) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; struct sockaddr_storage addr; int addr_len; if (!family || (type != SOCK_STREAM && type != SOCK_SEQPACKET)) return 0; { const int error = sock->ops->getname(sock, (struct sockaddr *) &addr, 0); if (error < 0) return error; addr_len = error; } address.protocol = type; address.operation = TOMOYO_NETWORK_LISTEN; if (family == PF_UNIX) return tomoyo_check_unix_address((struct sockaddr *) &addr, addr_len, &address); return tomoyo_check_inet_address((struct sockaddr *) &addr, addr_len, 0, &address); } /** * tomoyo_socket_connect_permission - Check permission for setting the remote address of a socket. * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_connect_permission(struct socket *sock, struct sockaddr *addr, int addr_len) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!family) return 0; address.protocol = type; switch (type) { case SOCK_DGRAM: case SOCK_RAW: address.operation = TOMOYO_NETWORK_SEND; break; case SOCK_STREAM: case SOCK_SEQPACKET: address.operation = TOMOYO_NETWORK_CONNECT; break; default: return 0; } if (family == PF_UNIX) return tomoyo_check_unix_address(addr, addr_len, &address); return tomoyo_check_inet_address(addr, addr_len, sock->sk->sk_protocol, &address); } /** * tomoyo_socket_bind_permission - Check permission for setting the local address of a socket. * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr, int addr_len) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!family) return 0; switch (type) { case SOCK_STREAM: case SOCK_DGRAM: case SOCK_RAW: case SOCK_SEQPACKET: address.protocol = type; address.operation = TOMOYO_NETWORK_BIND; break; default: return 0; } if (family == PF_UNIX) return tomoyo_check_unix_address(addr, addr_len, &address); return tomoyo_check_inet_address(addr, addr_len, sock->sk->sk_protocol, &address); } /** * tomoyo_socket_sendmsg_permission - Check permission for sending a datagram. * * @sock: Pointer to "struct socket". * @msg: Pointer to "struct msghdr". * @size: Unused. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg, int size) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!msg->msg_name || !family || (type != SOCK_DGRAM && type != SOCK_RAW)) return 0; address.protocol = type; address.operation = TOMOYO_NETWORK_SEND; if (family == PF_UNIX) return tomoyo_check_unix_address((struct sockaddr *) msg->msg_name, msg->msg_namelen, &address); return tomoyo_check_inet_address((struct sockaddr *) msg->msg_name, msg->msg_namelen, sock->sk->sk_protocol, &address); }
2 2 2 105 105 105 105 2 105 2 2 2 2 105 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2004, Instant802 Networks, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2022 Intel Corporation */ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/types.h> #include <net/ip.h> #include <net/pkt_sched.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wme.h" /* Default mapping in classifier to work with default * queue setup. */ const int ieee802_1d_to_ac[8] = { IEEE80211_AC_BE, IEEE80211_AC_BK, IEEE80211_AC_BK, IEEE80211_AC_BE, IEEE80211_AC_VI, IEEE80211_AC_VI, IEEE80211_AC_VO, IEEE80211_AC_VO }; static int wme_downgrade_ac(struct sk_buff *skb) { switch (skb->priority) { case 6: case 7: skb->priority = 5; /* VO -> VI */ return 0; case 4: case 5: skb->priority = 3; /* VI -> BE */ return 0; case 0: case 3: skb->priority = 2; /* BE -> BK */ return 0; default: return -1; } } /** * ieee80211_fix_reserved_tid - return the TID to use if this one is reserved * @tid: the assumed-reserved TID * * Returns: the alternative TID to use, or 0 on error */ static inline u8 ieee80211_fix_reserved_tid(u8 tid) { switch (tid) { case 0: return 3; case 1: return 2; case 2: return 1; case 3: return 0; case 4: return 5; case 5: return 4; case 6: return 7; case 7: return 6; } return 0; } static u16 ieee80211_downgrade_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; /* in case we are a client verify acm is not set for this ac */ while (sdata->wmm_acm & BIT(skb->priority)) { int ac = ieee802_1d_to_ac[skb->priority]; if (ifmgd->tx_tspec[ac].admitted_time && skb->priority == ifmgd->tx_tspec[ac].up) return ac; if (wme_downgrade_ac(skb)) { /* * This should not really happen. The AP has marked all * lower ACs to require admission control which is not * a reasonable configuration. Allow the frame to be * transmitted using AC_BK as a workaround. */ break; } } /* Check to see if this is a reserved TID */ if (sta && sta->reserved_tid == skb->priority) skb->priority = ieee80211_fix_reserved_tid(skb->priority); /* look up which queue to use for frames with this 1d tag */ return ieee802_1d_to_ac[skb->priority]; } /* Indicate which queue to use for this fully formed 802.11 frame */ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || local->hw.queues < IEEE80211_NUM_ACS) return 0; if (!ieee80211_is_data(hdr->frame_control)) { skb->priority = 7; return ieee802_1d_to_ac[skb->priority]; } if (!ieee80211_is_data_qos(hdr->frame_control)) { skb->priority = 0; return ieee802_1d_to_ac[skb->priority]; } p = ieee80211_get_qos_ctl(hdr); skb->priority = *p & IEEE80211_QOS_CTL_TAG1D_MASK; return ieee80211_downgrade_queue(sdata, NULL, skb); } u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct sk_buff *skb) { const struct ethhdr *eth = (void *)skb->data; struct mac80211_qos_map *qos_map; bool qos; /* Ensure hash is set prior to potential SW encryption */ skb_get_hash(skb); /* all mesh/ocb stations are required to support WME */ if ((sdata->vif.type == NL80211_IFTYPE_MESH_POINT && !is_multicast_ether_addr(eth->h_dest)) || (sdata->vif.type == NL80211_IFTYPE_OCB && sta)) qos = true; else if (sta) qos = sta->sta.wme; else qos = false; if (!qos) { skb->priority = 0; /* required for correct WPA/11i MIC */ return IEEE80211_AC_BE; } if (skb->protocol == sdata->control_port_protocol) { skb->priority = 7; goto downgrade; } /* use the data classifier to determine what 802.1d tag the * data frame has */ qos_map = rcu_dereference(sdata->qos_map); skb->priority = cfg80211_classify8021d(skb, qos_map ? &qos_map->qos_map : NULL); downgrade: return ieee80211_downgrade_queue(sdata, sta, skb); } /** * ieee80211_set_qos_hdr - Fill in the QoS header if there is one. * * @sdata: local subif * @skb: packet to be updated */ void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; u8 flags; u8 *p; if (!ieee80211_is_data_qos(hdr->frame_control)) return; p = ieee80211_get_qos_ctl(hdr); /* don't overwrite the QoS field of injected frames */ if (info->flags & IEEE80211_TX_CTL_INJECTED) { /* do take into account Ack policy of injected frames */ if (*p & IEEE80211_QOS_CTL_ACK_POLICY_NOACK) info->flags |= IEEE80211_TX_CTL_NO_ACK; return; } /* set up the first byte */ /* * preserve everything but the TID and ACK policy * (which we both write here) */ flags = *p & ~(IEEE80211_QOS_CTL_TID_MASK | IEEE80211_QOS_CTL_ACK_POLICY_MASK); if (is_multicast_ether_addr(hdr->addr1) || sdata->noack_map & BIT(tid)) { flags |= IEEE80211_QOS_CTL_ACK_POLICY_NOACK; info->flags |= IEEE80211_TX_CTL_NO_ACK; } *p = flags | tid; /* set up the second byte */ p++; if (ieee80211_vif_is_mesh(&sdata->vif)) { /* preserve RSPI and Mesh PS Level bit */ *p &= ((IEEE80211_QOS_CTL_RSPI | IEEE80211_QOS_CTL_MESH_PS_LEVEL) >> 8); /* Nulls don't have a mesh header (frame body) */ if (!ieee80211_is_qos_nullfunc(hdr->frame_control)) *p |= (IEEE80211_QOS_CTL_MESH_CONTROL_PRESENT >> 8); } else { *p = 0; } }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 /* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004, 2020 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved. */ #ifndef IB_VERBS_H #define IB_VERBS_H #include <linux/ethtool.h> #include <linux/types.h> #include <linux/device.h> #include <linux/dma-mapping.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/rwsem.h> #include <linux/workqueue.h> #include <linux/irq_poll.h> #include <uapi/linux/if_ether.h> #include <net/ipv6.h> #include <net/ip.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/refcount.h> #include <linux/if_link.h> #include <linux/atomic.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/cgroup_rdma.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/dim.h> #include <uapi/rdma/ib_user_verbs.h> #include <rdma/rdma_counter.h> #include <rdma/restrack.h> #include <rdma/signature.h> #include <uapi/rdma/rdma_user_ioctl.h> #include <uapi/rdma/ib_user_ioctl_verbs.h> #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN struct ib_umem_odp; struct ib_uqp_object; struct ib_usrq_object; struct ib_uwq_object; struct rdma_cm_id; struct ib_port; struct hw_stats_device_data; extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; extern struct workqueue_struct *ib_comp_unbound_wq; struct ib_ucq_object; __printf(3, 4) __cold void ibdev_printk(const char *level, const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_alert(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_crit(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_err(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_warn(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else __printf(2, 3) __cold static inline void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {} #endif #define ibdev_level_ratelimited(ibdev_level, ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ ibdev_level(ibdev, fmt, ##__VA_ARGS__); \ } while (0) #define ibdev_emerg_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_emerg, ibdev, fmt, ##__VA_ARGS__) #define ibdev_alert_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_alert, ibdev, fmt, ##__VA_ARGS__) #define ibdev_crit_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_crit, ibdev, fmt, ##__VA_ARGS__) #define ibdev_err_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_err, ibdev, fmt, ##__VA_ARGS__) #define ibdev_warn_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_warn, ibdev, fmt, ##__VA_ARGS__) #define ibdev_notice_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_notice, ibdev, fmt, ##__VA_ARGS__) #define ibdev_info_ratelimited(ibdev, fmt, ...) \ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && __ratelimit(&_rs)) \ __dynamic_ibdev_dbg(&descriptor, ibdev, fmt, \ ##__VA_ARGS__); \ } while (0) #else __printf(2, 3) __cold static inline void ibdev_dbg_ratelimited(const struct ib_device *ibdev, const char *format, ...) {} #endif union ib_gid { u8 raw[16]; struct { __be64 subnet_prefix; __be64 interface_id; } global; }; extern union ib_gid zgid; enum ib_gid_type { IB_GID_TYPE_IB = IB_UVERBS_GID_TYPE_IB, IB_GID_TYPE_ROCE = IB_UVERBS_GID_TYPE_ROCE_V1, IB_GID_TYPE_ROCE_UDP_ENCAP = IB_UVERBS_GID_TYPE_ROCE_V2, IB_GID_TYPE_SIZE }; #define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { struct net_device __rcu *ndev; struct ib_device *device; union ib_gid gid; enum ib_gid_type gid_type; u16 index; u32 port_num; }; enum { /* set the local administered indication */ IB_SA_WELL_KNOWN_GUID = BIT_ULL(57) | 2, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, RDMA_TRANSPORT_IWARP, RDMA_TRANSPORT_USNIC, RDMA_TRANSPORT_USNIC_UDP, RDMA_TRANSPORT_UNSPECIFIED, }; enum rdma_protocol_type { RDMA_PROTOCOL_IB, RDMA_PROTOCOL_IBOE, RDMA_PROTOCOL_IWARP, RDMA_PROTOCOL_USNIC_UDP }; __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type); enum rdma_network_type { RDMA_NETWORK_IB, RDMA_NETWORK_ROCE_V1, RDMA_NETWORK_IPV4, RDMA_NETWORK_IPV6 }; static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) { if (network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) return IB_GID_TYPE_ROCE_UDP_ENCAP; else if (network_type == RDMA_NETWORK_ROCE_V1) return IB_GID_TYPE_ROCE; else return IB_GID_TYPE_IB; } static inline enum rdma_network_type rdma_gid_attr_network_type(const struct ib_gid_attr *attr) { if (attr->gid_type == IB_GID_TYPE_IB) return RDMA_NETWORK_IB; if (attr->gid_type == IB_GID_TYPE_ROCE) return RDMA_NETWORK_ROCE_V1; if (ipv6_addr_v4mapped((struct in6_addr *)&attr->gid)) return RDMA_NETWORK_IPV4; else return RDMA_NETWORK_IPV6; } enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, IB_LINK_LAYER_ETHERNET, }; enum ib_device_cap_flags { IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR, IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR, IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR, IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI, IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG, IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT, IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE, IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD, IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT, /* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */ IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT, IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID, IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN, IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE, IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ, /* Reserved, old SEND_W_INV = 1 << 16,*/ IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW, /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB * messages and can verify the validity of checksum for * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM, IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC, /* * This device supports the IB "base memory management extension", * which includes support for fast registrations (IB_WR_REG_MR, * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should * also be set by any iWarp device which must support FRs to comply * to the iWarp verbs spec. iWarp devices also support the * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the * stag. */ IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS, IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A, IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B, IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM, /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */ IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM, IB_DEVICE_MANAGED_FLOW_STEERING = IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING, /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */ IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS, /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING, /* Placement type attributes */ IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL, IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT, IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE, }; enum ib_kernel_cap_flags { /* * This device supports a per-device lkey or stag that can be * used without performing a memory registration for the local * memory. Note that ULPs should never check this flag, but * instead of use the local_dma_lkey flag in the ib_pd structure, * which will always contain a usable lkey. */ IBK_LOCAL_DMA_LKEY = 1 << 0, /* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */ IBK_INTEGRITY_HANDOVER = 1 << 1, /* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */ IBK_ON_DEMAND_PAGING = 1 << 2, /* IB_MR_TYPE_SG_GAPS is supported */ IBK_SG_GAPS_REG = 1 << 3, /* Driver supports RDMA_NLDEV_CMD_DELLINK */ IBK_ALLOW_USER_UNREG = 1 << 4, /* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */ IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5, /* iopib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */ IBK_UD_TSO = 1 << 6, /* iopib will use the device ops: * get_vf_config * get_vf_guid * get_vf_stats * set_vf_guid * set_vf_link_state */ IBK_VIRTUAL_FUNCTION = 1 << 7, /* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */ IBK_RDMA_NETDEV_OPA = 1 << 8, }; enum ib_atomic_cap { IB_ATOMIC_NONE, IB_ATOMIC_HCA, IB_ATOMIC_GLOB }; enum ib_odp_general_cap_bits { IB_ODP_SUPPORT = 1 << 0, IB_ODP_SUPPORT_IMPLICIT = 1 << 1, }; enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_SEND = 1 << 0, IB_ODP_SUPPORT_RECV = 1 << 1, IB_ODP_SUPPORT_WRITE = 1 << 2, IB_ODP_SUPPORT_READ = 1 << 3, IB_ODP_SUPPORT_ATOMIC = 1 << 4, IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, }; struct ib_odp_caps { uint64_t general_caps; struct { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; uint32_t xrc_odp_caps; } per_transport_caps; }; struct ib_rss_caps { /* Corresponding bit will be set if qp type from * 'enum ib_qp_type' is supported, e.g. * supported_qpts |= 1 << IB_QPT_UD */ u32 supported_qpts; u32 max_rwq_indirection_tables; u32 max_rwq_indirection_table_size; }; enum ib_tm_cap_flags { /* Support tag matching with rendezvous offload for RC transport */ IB_TM_CAP_RNDV_RC = 1 << 0, }; struct ib_tm_caps { /* Max size of RNDV header */ u32 max_rndv_hdr_size; /* Max number of entries in tag matching list */ u32 max_num_tags; /* From enum ib_tm_cap_flags */ u32 flags; /* Max number of outstanding list operations */ u32 max_ops; /* Max number of SGE in tag matching entry */ u32 max_sge; }; struct ib_cq_init_attr { unsigned int cqe; u32 comp_vector; u32 flags; }; enum ib_cq_attr_mask { IB_CQ_MODERATE = 1 << 0, }; struct ib_cq_caps { u16 max_cq_moderation_count; u16 max_cq_moderation_period; }; struct ib_dm_mr_attr { u64 length; u64 offset; u32 access_flags; }; struct ib_dm_alloc_attr { u64 length; u32 alignment; u32 flags; }; struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; u64 max_mr_size; u64 page_size_cap; u32 vendor_id; u32 vendor_part_id; u32 hw_ver; int max_qp; int max_qp_wr; u64 device_cap_flags; u64 kernel_cap_flags; int max_send_sge; int max_recv_sge; int max_sge_rd; int max_cq; int max_cqe; int max_mr; int max_pd; int max_qp_rd_atom; int max_ee_rd_atom; int max_res_rd_atom; int max_qp_init_rd_atom; int max_ee_init_rd_atom; enum ib_atomic_cap atomic_cap; enum ib_atomic_cap masked_atomic_cap; int max_ee; int max_rdd; int max_mw; int max_raw_ipv6_qp; int max_raw_ethy_qp; int max_mcast_grp; int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; int max_srq; int max_srq_wr; int max_srq_sge; unsigned int max_fast_reg_page_list_len; unsigned int max_pi_fast_reg_page_list_len; u16 max_pkeys; u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; struct ib_odp_caps odp_caps; uint64_t timestamp_mask; uint64_t hca_core_clock; /* in KHZ */ struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ struct ib_tm_caps tm_caps; struct ib_cq_caps cq_caps; u64 max_dm_size; /* Max entries for sgl for optimized performance per READ */ u32 max_sgl_rd; }; enum ib_mtu { IB_MTU_256 = 1, IB_MTU_512 = 2, IB_MTU_1024 = 3, IB_MTU_2048 = 4, IB_MTU_4096 = 5 }; enum opa_mtu { OPA_MTU_8192 = 6, OPA_MTU_10240 = 7 }; static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { case IB_MTU_256: return 256; case IB_MTU_512: return 512; case IB_MTU_1024: return 1024; case IB_MTU_2048: return 2048; case IB_MTU_4096: return 4096; default: return -1; } } static inline enum ib_mtu ib_mtu_int_to_enum(int mtu) { if (mtu >= 4096) return IB_MTU_4096; else if (mtu >= 2048) return IB_MTU_2048; else if (mtu >= 1024) return IB_MTU_1024; else if (mtu >= 512) return IB_MTU_512; else return IB_MTU_256; } static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) { switch (mtu) { case OPA_MTU_8192: return 8192; case OPA_MTU_10240: return 10240; default: return(ib_mtu_enum_to_int((enum ib_mtu)mtu)); } } static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) { if (mtu >= 10240) return OPA_MTU_10240; else if (mtu >= 8192) return OPA_MTU_8192; else return ((enum opa_mtu)ib_mtu_int_to_enum(mtu)); } enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, IB_PORT_INIT = 2, IB_PORT_ARMED = 3, IB_PORT_ACTIVE = 4, IB_PORT_ACTIVE_DEFER = 5 }; enum ib_port_phys_state { IB_PORT_PHYS_STATE_SLEEP = 1, IB_PORT_PHYS_STATE_POLLING = 2, IB_PORT_PHYS_STATE_DISABLED = 3, IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4, IB_PORT_PHYS_STATE_LINK_UP = 5, IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6, IB_PORT_PHYS_STATE_PHY_TEST = 7, }; enum ib_port_width { IB_WIDTH_1X = 1, IB_WIDTH_2X = 16, IB_WIDTH_4X = 2, IB_WIDTH_8X = 4, IB_WIDTH_12X = 8 }; static inline int ib_width_enum_to_int(enum ib_port_width width) { switch (width) { case IB_WIDTH_1X: return 1; case IB_WIDTH_2X: return 2; case IB_WIDTH_4X: return 4; case IB_WIDTH_8X: return 8; case IB_WIDTH_12X: return 12; default: return -1; } } enum ib_port_speed { IB_SPEED_SDR = 1, IB_SPEED_DDR = 2, IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, IB_SPEED_EDR = 32, IB_SPEED_HDR = 64, IB_SPEED_NDR = 128, }; enum ib_stat_flag { IB_STAT_FLAG_OPTIONAL = 1 << 0, }; /** * struct rdma_stat_desc * @name - The name of the counter * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL * @priv - Driver private information; Core code should not use */ struct rdma_stat_desc { const char *name; unsigned int flags; const void *priv; }; /** * struct rdma_hw_stats * @lock - Mutex to protect parallel write access to lifespan and values * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. * @timestamp - Used by the core code to track when the last update was * @lifespan - Used by the core code to determine how old the counters * should be before being updated again. Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. * @descs - Array of pointers to static descriptors used for the counters * in directory. * @is_disabled - A bitmap to indicate each counter is currently disabled * or not. * @num_counters - How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. * @value - Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { struct mutex lock; /* Protect lifespan and values[] */ unsigned long timestamp; unsigned long lifespan; const struct rdma_stat_desc *descs; unsigned long *is_disabled; int num_counters; u64 value[]; }; #define RDMA_HW_STATS_DEFAULT_LIFESPAN 10 struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const struct rdma_stat_desc *descs, int num_counters, unsigned long lifespan); void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats); /* Define bits for the various functionality this port needs to be supported by * the core. */ /* Management 0x00000FFF */ #define RDMA_CORE_CAP_IB_MAD 0x00000001 #define RDMA_CORE_CAP_IB_SMI 0x00000002 #define RDMA_CORE_CAP_IB_CM 0x00000004 #define RDMA_CORE_CAP_IW_CM 0x00000008 #define RDMA_CORE_CAP_IB_SA 0x00000010 #define RDMA_CORE_CAP_OPA_MAD 0x00000020 /* Address format 0x000FF000 */ #define RDMA_CORE_CAP_AF_IB 0x00001000 #define RDMA_CORE_CAP_ETH_AH 0x00002000 #define RDMA_CORE_CAP_OPA_AH 0x00004000 #define RDMA_CORE_CAP_IB_GRH_REQUIRED 0x00008000 /* Protocol 0xFFF00000 */ #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 #define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 #define RDMA_CORE_CAP_PROT_RAW_PACKET 0x01000000 #define RDMA_CORE_CAP_PROT_USNIC 0x02000000 #define RDMA_CORE_PORT_IB_GRH_REQUIRED (RDMA_CORE_CAP_IB_GRH_REQUIRED \ | RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP) #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_SMI \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_IB_SA \ | RDMA_CORE_CAP_AF_IB) #define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ | RDMA_CORE_CAP_IB_MAD \ | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ | RDMA_CORE_CAP_OPA_MAD) #define RDMA_CORE_PORT_RAW_PACKET (RDMA_CORE_CAP_PROT_RAW_PACKET) #define RDMA_CORE_PORT_USNIC (RDMA_CORE_CAP_PROT_USNIC) struct ib_port_attr { u64 subnet_prefix; enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; u32 phys_mtu; int gid_tbl_len; unsigned int ip_gids:1; /* This is the value from PortInfo CapabilityMask, defined by IBA */ u32 port_cap_flags; u32 max_msg_sz; u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; u32 sm_lid; u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; u8 subnet_timeout; u8 init_type_reply; u8 active_width; u16 active_speed; u8 phys_state; u16 port_cap_flags2; }; enum ib_device_modify_flags { IB_DEVICE_MODIFY_SYS_IMAGE_GUID = 1 << 0, IB_DEVICE_MODIFY_NODE_DESC = 1 << 1 }; #define IB_DEVICE_NODE_DESC_MAX 64 struct ib_device_modify { u64 sys_image_guid; char node_desc[IB_DEVICE_NODE_DESC_MAX]; }; enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), IB_PORT_RESET_QKEY_CNTR = (1<<3), IB_PORT_OPA_MASK_CHG = (1<<4) }; struct ib_port_modify { u32 set_port_cap_mask; u32 clr_port_cap_mask; u8 init_type; }; enum ib_event_type { IB_EVENT_CQ_ERR, IB_EVENT_QP_FATAL, IB_EVENT_QP_REQ_ERR, IB_EVENT_QP_ACCESS_ERR, IB_EVENT_COMM_EST, IB_EVENT_SQ_DRAINED, IB_EVENT_PATH_MIG, IB_EVENT_PATH_MIG_ERR, IB_EVENT_DEVICE_FATAL, IB_EVENT_PORT_ACTIVE, IB_EVENT_PORT_ERR, IB_EVENT_LID_CHANGE, IB_EVENT_PKEY_CHANGE, IB_EVENT_SM_CHANGE, IB_EVENT_SRQ_ERR, IB_EVENT_SRQ_LIMIT_REACHED, IB_EVENT_QP_LAST_WQE_REACHED, IB_EVENT_CLIENT_REREGISTER, IB_EVENT_GID_CHANGE, IB_EVENT_WQ_FATAL, }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event); struct ib_event { struct ib_device *device; union { struct ib_cq *cq; struct ib_qp *qp; struct ib_srq *srq; struct ib_wq *wq; u32 port_num; } element; enum ib_event_type event; }; struct ib_event_handler { struct ib_device *device; void (*handler)(struct ib_event_handler *, struct ib_event *); struct list_head list; }; #define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler) \ do { \ (_ptr)->device = _device; \ (_ptr)->handler = _handler; \ INIT_LIST_HEAD(&(_ptr)->list); \ } while (0) struct ib_global_route { const struct ib_gid_attr *sgid_attr; union ib_gid dgid; u32 flow_label; u8 sgid_index; u8 hop_limit; u8 traffic_class; }; struct ib_grh { __be32 version_tclass_flow; __be16 paylen; u8 next_hdr; u8 hop_limit; union ib_gid sgid; union ib_gid dgid; }; union rdma_network_hdr { struct ib_grh ibgrh; struct { /* The IB spec states that if it's IPv4, the header * is located in the last 20 bytes of the header. */ u8 reserved[20]; struct iphdr roce4grh; }; }; #define IB_QPN_MASK 0xFFFFFF enum { IB_MULTICAST_QPN = 0xffffff }; #define IB_LID_PERMISSIVE cpu_to_be16(0xFFFF) #define IB_MULTICAST_LID_BASE cpu_to_be16(0xC000) enum ib_ah_flags { IB_AH_GRH = 1 }; enum ib_rate { IB_RATE_PORT_CURRENT = 0, IB_RATE_2_5_GBPS = 2, IB_RATE_5_GBPS = 5, IB_RATE_10_GBPS = 3, IB_RATE_20_GBPS = 6, IB_RATE_30_GBPS = 4, IB_RATE_40_GBPS = 7, IB_RATE_60_GBPS = 8, IB_RATE_80_GBPS = 9, IB_RATE_120_GBPS = 10, IB_RATE_14_GBPS = 11, IB_RATE_56_GBPS = 12, IB_RATE_112_GBPS = 13, IB_RATE_168_GBPS = 14, IB_RATE_25_GBPS = 15, IB_RATE_100_GBPS = 16, IB_RATE_200_GBPS = 17, IB_RATE_300_GBPS = 18, IB_RATE_28_GBPS = 19, IB_RATE_50_GBPS = 20, IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, }; /** * ib_rate_to_mult - Convert the IB rate enum to a multiple of the * base rate of 2.5 Gbit/sec. For example, IB_RATE_5_GBPS will be * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); /** * ib_rate_to_mbps - Convert the IB rate enum to Mbps. * For example, IB_RATE_2_5_GBPS will be converted to 2500. * @rate: rate to convert. */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); /** * enum ib_mr_type - memory region type * @IB_MR_TYPE_MEM_REG: memory region that is used for * normal registration * @IB_MR_TYPE_SG_GAPS: memory region that is capable to * register any arbitrary sg lists (without * the normal mr constraints - see * ib_map_mr_sg) * @IB_MR_TYPE_DM: memory region that is used for device * memory registration * @IB_MR_TYPE_USER: memory region that is used for the user-space * application * @IB_MR_TYPE_DMA: memory region that is used for DMA operations * without address translations (VA=PA) * @IB_MR_TYPE_INTEGRITY: memory region that is used for * data integrity operations */ enum ib_mr_type { IB_MR_TYPE_MEM_REG, IB_MR_TYPE_SG_GAPS, IB_MR_TYPE_DM, IB_MR_TYPE_USER, IB_MR_TYPE_DMA, IB_MR_TYPE_INTEGRITY, }; enum ib_mr_status_check { IB_MR_CHECK_SIG_STATUS = 1, }; /** * struct ib_mr_status - Memory region status container * * @fail_status: Bitmask of MR checks status. For each * failed check a corresponding status bit is set. * @sig_err: Additional info for IB_MR_CEHCK_SIG_STATUS * failure. */ struct ib_mr_status { u32 fail_status; struct ib_sig_err sig_err; }; /** * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate * enum. * @mult: multiple to convert. */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct rdma_ah_init_attr { struct rdma_ah_attr *ah_attr; u32 flags; struct net_device *xmit_slave; }; enum rdma_ah_attr_type { RDMA_AH_ATTR_TYPE_UNDEFINED, RDMA_AH_ATTR_TYPE_IB, RDMA_AH_ATTR_TYPE_ROCE, RDMA_AH_ATTR_TYPE_OPA, }; struct ib_ah_attr { u16 dlid; u8 src_path_bits; }; struct roce_ah_attr { u8 dmac[ETH_ALEN]; }; struct opa_ah_attr { u32 dlid; u8 src_path_bits; bool make_grd; }; struct rdma_ah_attr { struct ib_global_route grh; u8 sl; u8 static_rate; u32 port_num; u8 ah_flags; enum rdma_ah_attr_type type; union { struct ib_ah_attr ib; struct roce_ah_attr roce; struct opa_ah_attr opa; }; }; enum ib_wc_status { IB_WC_SUCCESS, IB_WC_LOC_LEN_ERR, IB_WC_LOC_QP_OP_ERR, IB_WC_LOC_EEC_OP_ERR, IB_WC_LOC_PROT_ERR, IB_WC_WR_FLUSH_ERR, IB_WC_MW_BIND_ERR, IB_WC_BAD_RESP_ERR, IB_WC_LOC_ACCESS_ERR, IB_WC_REM_INV_REQ_ERR, IB_WC_REM_ACCESS_ERR, IB_WC_REM_OP_ERR, IB_WC_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR, IB_WC_LOC_RDD_VIOL_ERR, IB_WC_REM_INV_RD_REQ_ERR, IB_WC_REM_ABORT_ERR, IB_WC_INV_EECN_ERR, IB_WC_INV_EEC_STATE_ERR, IB_WC_FATAL_ERR, IB_WC_RESP_TIMEOUT_ERR, IB_WC_GENERAL_ERR }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); enum ib_wc_opcode { IB_WC_SEND = IB_UVERBS_WC_SEND, IB_WC_RDMA_WRITE = IB_UVERBS_WC_RDMA_WRITE, IB_WC_RDMA_READ = IB_UVERBS_WC_RDMA_READ, IB_WC_COMP_SWAP = IB_UVERBS_WC_COMP_SWAP, IB_WC_FETCH_ADD = IB_UVERBS_WC_FETCH_ADD, IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW, IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV, IB_WC_LSO = IB_UVERBS_WC_TSO, IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE, IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, IB_WC_FLUSH = IB_UVERBS_WC_FLUSH, /* * Set value of IB_WC_RECV so consumers can test if a completion is a * receive by testing (opcode & IB_WC_RECV). */ IB_WC_RECV = 1 << 7, IB_WC_RECV_RDMA_WITH_IMM }; enum ib_wc_flags { IB_WC_GRH = 1, IB_WC_WITH_IMM = (1<<1), IB_WC_WITH_INVALIDATE = (1<<2), IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { union { u64 wr_id; struct ib_cqe *wr_cqe; }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; u32 byte_len; struct ib_qp *qp; union { __be32 imm_data; u32 invalidate_rkey; } ex; u32 src_qp; u32 slid; int wc_flags; u16 pkey_index; u8 sl; u8 dlid_path_bits; u32 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; u8 network_hdr_type; }; enum ib_cq_notify_flags { IB_CQ_SOLICITED = 1 << 0, IB_CQ_NEXT_COMP = 1 << 1, IB_CQ_SOLICITED_MASK = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP, IB_CQ_REPORT_MISSED_EVENTS = 1 << 2, }; enum ib_srq_type { IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC, IB_SRQT_XRC = IB_UVERBS_SRQT_XRC, IB_SRQT_TM = IB_UVERBS_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) { return srq_type == IB_SRQT_XRC || srq_type == IB_SRQT_TM; } enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, }; struct ib_srq_attr { u32 max_wr; u32 max_sge; u32 srq_limit; }; struct ib_srq_init_attr { void (*event_handler)(struct ib_event *, void *); void *srq_context; struct ib_srq_attr attr; enum ib_srq_type srq_type; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; } xrc; struct { u32 max_num_tags; } tag_matching; }; } ext; }; struct ib_qp_cap { u32 max_send_wr; u32 max_recv_wr; u32 max_send_sge; u32 max_recv_sge; u32 max_inline_data; /* * Maximum number of rdma_rw_ctx structures in flight at a time. * ib_create_qp() will calculate the right amount of neededed WRs * and MRs based on this. */ u32 max_rdma_ctxs; }; enum ib_sig_type { IB_SIGNAL_ALL_WR, IB_SIGNAL_REQ_WR }; enum ib_qp_type { /* * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries * here (and in that order) since the MAD layer uses them as * indices into a 2-entry table. */ IB_QPT_SMI, IB_QPT_GSI, IB_QPT_RC = IB_UVERBS_QPT_RC, IB_QPT_UC = IB_UVERBS_QPT_UC, IB_QPT_UD = IB_UVERBS_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET, IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI, IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT, IB_QPT_MAX, IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer */ IB_QPT_RESERVED1 = 0x1000, IB_QPT_RESERVED2, IB_QPT_RESERVED3, IB_QPT_RESERVED4, IB_QPT_RESERVED5, IB_QPT_RESERVED6, IB_QPT_RESERVED7, IB_QPT_RESERVED8, IB_QPT_RESERVED9, IB_QPT_RESERVED10, }; enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_INTEGRITY_EN = 1 << 6, IB_QP_CREATE_NETDEV_USE = 1 << 7, IB_QP_CREATE_SCATTER_FCS = IB_UVERBS_QP_CREATE_SCATTER_FCS, IB_QP_CREATE_CVLAN_STRIPPING = IB_UVERBS_QP_CREATE_CVLAN_STRIPPING, IB_QP_CREATE_SOURCE_QPN = 1 << 10, IB_QP_CREATE_PCI_WRITE_END_PADDING = IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, }; /* * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler * callback to destroy the passed in QP. */ struct ib_qp_init_attr { /* This callback occurs in workqueue context */ void (*event_handler)(struct ib_event *, void *); void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; u32 create_flags; /* * Only needed for special QP types, or when using the RW API. */ u32 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; u32 source_qpn; }; struct ib_qp_open_attr { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; enum ib_qp_type qp_type; }; enum ib_rnr_timeout { IB_RNR_TIMER_655_36 = 0, IB_RNR_TIMER_000_01 = 1, IB_RNR_TIMER_000_02 = 2, IB_RNR_TIMER_000_03 = 3, IB_RNR_TIMER_000_04 = 4, IB_RNR_TIMER_000_06 = 5, IB_RNR_TIMER_000_08 = 6, IB_RNR_TIMER_000_12 = 7, IB_RNR_TIMER_000_16 = 8, IB_RNR_TIMER_000_24 = 9, IB_RNR_TIMER_000_32 = 10, IB_RNR_TIMER_000_48 = 11, IB_RNR_TIMER_000_64 = 12, IB_RNR_TIMER_000_96 = 13, IB_RNR_TIMER_001_28 = 14, IB_RNR_TIMER_001_92 = 15, IB_RNR_TIMER_002_56 = 16, IB_RNR_TIMER_003_84 = 17, IB_RNR_TIMER_005_12 = 18, IB_RNR_TIMER_007_68 = 19, IB_RNR_TIMER_010_24 = 20, IB_RNR_TIMER_015_36 = 21, IB_RNR_TIMER_020_48 = 22, IB_RNR_TIMER_030_72 = 23, IB_RNR_TIMER_040_96 = 24, IB_RNR_TIMER_061_44 = 25, IB_RNR_TIMER_081_92 = 26, IB_RNR_TIMER_122_88 = 27, IB_RNR_TIMER_163_84 = 28, IB_RNR_TIMER_245_76 = 29, IB_RNR_TIMER_327_68 = 30, IB_RNR_TIMER_491_52 = 31 }; enum ib_qp_attr_mask { IB_QP_STATE = 1, IB_QP_CUR_STATE = (1<<1), IB_QP_EN_SQD_ASYNC_NOTIFY = (1<<2), IB_QP_ACCESS_FLAGS = (1<<3), IB_QP_PKEY_INDEX = (1<<4), IB_QP_PORT = (1<<5), IB_QP_QKEY = (1<<6), IB_QP_AV = (1<<7), IB_QP_PATH_MTU = (1<<8), IB_QP_TIMEOUT = (1<<9), IB_QP_RETRY_CNT = (1<<10), IB_QP_RNR_RETRY = (1<<11), IB_QP_RQ_PSN = (1<<12), IB_QP_MAX_QP_RD_ATOMIC = (1<<13), IB_QP_ALT_PATH = (1<<14), IB_QP_MIN_RNR_TIMER = (1<<15), IB_QP_SQ_PSN = (1<<16), IB_QP_MAX_DEST_RD_ATOMIC = (1<<17), IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), IB_QP_RESERVED1 = (1<<21), IB_QP_RESERVED2 = (1<<22), IB_QP_RESERVED3 = (1<<23), IB_QP_RESERVED4 = (1<<24), IB_QP_RATE_LIMIT = (1<<25), IB_QP_ATTR_STANDARD_BITS = GENMASK(20, 0), }; enum ib_qp_state { IB_QPS_RESET, IB_QPS_INIT, IB_QPS_RTR, IB_QPS_RTS, IB_QPS_SQD, IB_QPS_SQE, IB_QPS_ERR }; enum ib_mig_state { IB_MIG_MIGRATED, IB_MIG_REARM, IB_MIG_ARMED }; enum ib_mw_type { IB_MW_TYPE_1 = 1, IB_MW_TYPE_2 = 2 }; struct ib_qp_attr { enum ib_qp_state qp_state; enum ib_qp_state cur_qp_state; enum ib_mtu path_mtu; enum ib_mig_state path_mig_state; u32 qkey; u32 rq_psn; u32 sq_psn; u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; struct rdma_ah_attr ah_attr; struct rdma_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; u8 sq_draining; u8 max_rd_atomic; u8 max_dest_rd_atomic; u8 min_rnr_timer; u32 port_num; u8 timeout; u8 retry_cnt; u8 rnr_retry; u32 alt_port_num; u8 alt_timeout; u32 rate_limit; struct net_device *xmit_slave; }; enum ib_wr_opcode { /* These are shared with userspace */ IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, IB_WR_SEND = IB_UVERBS_WR_SEND, IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, IB_WR_BIND_MW = IB_UVERBS_WR_BIND_MW, IB_WR_LSO = IB_UVERBS_WR_TSO, IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, IB_WR_MASKED_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, IB_WR_FLUSH = IB_UVERBS_WR_FLUSH, IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE, /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, IB_WR_REG_MR_INTEGRITY, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ IB_WR_RESERVED1 = 0xf0, IB_WR_RESERVED2, IB_WR_RESERVED3, IB_WR_RESERVED4, IB_WR_RESERVED5, IB_WR_RESERVED6, IB_WR_RESERVED7, IB_WR_RESERVED8, IB_WR_RESERVED9, IB_WR_RESERVED10, }; enum ib_send_flags { IB_SEND_FENCE = 1, IB_SEND_SIGNALED = (1<<1), IB_SEND_SOLICITED = (1<<2), IB_SEND_INLINE = (1<<3), IB_SEND_IP_CSUM = (1<<4), /* reserve bits 26-31 for low level drivers' internal use */ IB_SEND_RESERVED_START = (1 << 26), IB_SEND_RESERVED_END = (1 << 31), }; struct ib_sge { u64 addr; u32 length; u32 lkey; }; struct ib_cqe { void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; int send_flags; union { __be32 imm_data; u32 invalidate_rkey; } ex; }; struct ib_rdma_wr { struct ib_send_wr wr; u64 remote_addr; u32 rkey; }; static inline const struct ib_rdma_wr *rdma_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_rdma_wr, wr); } struct ib_atomic_wr { struct ib_send_wr wr; u64 remote_addr; u64 compare_add; u64 swap; u64 compare_add_mask; u64 swap_mask; u32 rkey; }; static inline const struct ib_atomic_wr *atomic_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_atomic_wr, wr); } struct ib_ud_wr { struct ib_send_wr wr; struct ib_ah *ah; void *header; int hlen; int mss; u32 remote_qpn; u32 remote_qkey; u16 pkey_index; /* valid for GSI only */ u32 port_num; /* valid for DR SMPs on switch only */ }; static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_ud_wr, wr); } struct ib_reg_wr { struct ib_send_wr wr; struct ib_mr *mr; u32 key; int access; }; static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr) { return container_of(wr, struct ib_reg_wr, wr); } struct ib_recv_wr { struct ib_recv_wr *next; union { u64 wr_id; struct ib_cqe *wr_cqe; }; struct ib_sge *sg_list; int num_sge; }; enum ib_access_flags { IB_ACCESS_LOCAL_WRITE = IB_UVERBS_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_WRITE = IB_UVERBS_ACCESS_REMOTE_WRITE, IB_ACCESS_REMOTE_READ = IB_UVERBS_ACCESS_REMOTE_READ, IB_ACCESS_REMOTE_ATOMIC = IB_UVERBS_ACCESS_REMOTE_ATOMIC, IB_ACCESS_MW_BIND = IB_UVERBS_ACCESS_MW_BIND, IB_ZERO_BASED = IB_UVERBS_ACCESS_ZERO_BASED, IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND, IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB, IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING, IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL, IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT, IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE, IB_ACCESS_SUPPORTED = ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL, }; /* * XXX: these are apparently used for ->rereg_user_mr, no idea why they * are hidden here instead of a uapi header! */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), IB_MR_REREG_ACCESS = (1<<2), IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; struct ib_umem; enum rdma_remove_reason { /* * Userspace requested uobject deletion or initial try * to remove uobject via cleanup. Call could fail */ RDMA_REMOVE_DESTROY, /* Context deletion. This call should delete the actual object itself */ RDMA_REMOVE_CLOSE, /* Driver is being hot-unplugged. This call should delete the actual object itself */ RDMA_REMOVE_DRIVER_REMOVE, /* uobj is being cleaned-up before being committed */ RDMA_REMOVE_ABORT, /* The driver failed to destroy the uobject and is being disconnected */ RDMA_REMOVE_DRIVER_FAILURE, }; struct ib_rdmacg_object { #ifdef CONFIG_CGROUP_RDMA struct rdma_cgroup *cg; /* owner rdma cgroup */ #endif }; struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; struct ib_rdmacg_object cg_obj; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; struct xarray mmap_xa; }; struct ib_uobject { u64 user_handle; /* handle given to us by userspace */ /* ufile & ucontext owning this object */ struct ib_uverbs_file *ufile; /* FIXME, save memory: ufile->context == context */ struct ib_ucontext *context; /* associated user context */ void *object; /* containing object */ struct list_head list; /* link to context's list */ struct ib_rdmacg_object cg_obj; /* rdmacg object */ int id; /* index into kernel idr */ struct kref ref; atomic_t usecnt; /* protects exclusive access */ struct rcu_head rcu; /* kfree_rcu() overhead */ const struct uverbs_api_object *uapi_object; }; struct ib_udata { const void __user *inbuf; void __user *outbuf; size_t inlen; size_t outlen; }; struct ib_pd { u32 local_dma_lkey; u32 flags; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ u32 unsafe_global_rkey; /* * Implementation details of the RDMA core, don't use in drivers: */ struct ib_mr *__internal_mr; struct rdma_restrack_entry res; }; struct ib_xrcd { struct ib_device *device; atomic_t usecnt; /* count all exposed resources */ struct inode *inode; struct rw_semaphore tgt_qps_rwsem; struct xarray tgt_qps; }; struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; const struct ib_gid_attr *sgid_attr; enum rdma_ah_attr_type type; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { IB_POLL_SOFTIRQ, /* poll from softirq context */ IB_POLL_WORKQUEUE, /* poll from workqueue */ IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE, IB_POLL_DIRECT, /* caller context, no hw completions */ }; struct ib_cq { struct ib_device *device; struct ib_ucq_object *uobject; ib_comp_handler comp_handler; void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; unsigned int cqe_used; atomic_t usecnt; /* count number of work queues */ enum ib_poll_context poll_ctx; struct ib_wc *wc; struct list_head pool_entry; union { struct irq_poll iop; struct work_struct work; }; struct workqueue_struct *comp_wq; struct dim *dim; /* updated only by trace points */ ktime_t timestamp; u8 interrupt:1; u8 shared:1; unsigned int comp_vector; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_srq { struct ib_device *device; struct ib_pd *pd; struct ib_usrq_object *uobject; void (*event_handler)(struct ib_event *, void *); void *srq_context; enum ib_srq_type srq_type; atomic_t usecnt; struct { struct ib_cq *cq; union { struct { struct ib_xrcd *xrcd; u32 srq_num; } xrc; }; } ext; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; enum ib_raw_packet_caps { /* * Strip cvlan from incoming packet and report it in the matching work * completion is supported. */ IB_RAW_PACKET_CAP_CVLAN_STRIPPING = IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING, /* * Scatter FCS field of an incoming packet to host memory is supported. */ IB_RAW_PACKET_CAP_SCATTER_FCS = IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS, /* Checksum offloads are supported (for both send and receive). */ IB_RAW_PACKET_CAP_IP_CSUM = IB_UVERBS_RAW_PACKET_CAP_IP_CSUM, /* * When a packet is received for an RQ with no receive WQEs, the * packet processing is delayed. */ IB_RAW_PACKET_CAP_DELAY_DROP = IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP, }; enum ib_wq_type { IB_WQT_RQ = IB_UVERBS_WQT_RQ, }; enum ib_wq_state { IB_WQS_RESET, IB_WQS_RDY, IB_WQS_ERR }; struct ib_wq { struct ib_device *device; struct ib_uwq_object *uobject; void *wq_context; void (*event_handler)(struct ib_event *, void *); struct ib_pd *pd; struct ib_cq *cq; u32 wq_num; enum ib_wq_state state; enum ib_wq_type wq_type; atomic_t usecnt; }; enum ib_wq_flags { IB_WQ_FLAGS_CVLAN_STRIPPING = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING, IB_WQ_FLAGS_SCATTER_FCS = IB_UVERBS_WQ_FLAGS_SCATTER_FCS, IB_WQ_FLAGS_DELAY_DROP = IB_UVERBS_WQ_FLAGS_DELAY_DROP, IB_WQ_FLAGS_PCI_WRITE_END_PADDING = IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING, }; struct ib_wq_init_attr { void *wq_context; enum ib_wq_type wq_type; u32 max_wr; u32 max_sge; struct ib_cq *cq; void (*event_handler)(struct ib_event *, void *); u32 create_flags; /* Use enum ib_wq_flags */ }; enum ib_wq_attr_mask { IB_WQ_STATE = 1 << 0, IB_WQ_CUR_STATE = 1 << 1, IB_WQ_FLAGS = 1 << 2, }; struct ib_wq_attr { enum ib_wq_state wq_state; enum ib_wq_state curr_wq_state; u32 flags; /* Use enum ib_wq_flags */ u32 flags_mask; /* Use enum ib_wq_flags */ }; struct ib_rwq_ind_table { struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; u32 ind_tbl_num; u32 log_ind_tbl_size; struct ib_wq **ind_tbl; }; struct ib_rwq_ind_table_init_attr { u32 log_ind_tbl_size; /* Each entry is a pointer to Receive Work Queue */ struct ib_wq **ind_tbl; }; enum port_pkey_state { IB_PORT_PKEY_NOT_VALID = 0, IB_PORT_PKEY_VALID = 1, IB_PORT_PKEY_LISTED = 2, }; struct ib_qp_security; struct ib_port_pkey { enum port_pkey_state state; u16 pkey_index; u32 port_num; struct list_head qp_list; struct list_head to_error_list; struct ib_qp_security *sec; }; struct ib_ports_pkeys { struct ib_port_pkey main; struct ib_port_pkey alt; }; struct ib_qp_security { struct ib_qp *qp; struct ib_device *dev; /* Hold this mutex when changing port and pkey settings. */ struct mutex mutex; struct ib_ports_pkeys *ports_pkeys; /* A list of all open shared QP handles. Required to enforce security * properly for all users of a shared QP. */ struct list_head shared_qp_list; void *security; bool destroying; atomic_t error_list_count; struct completion error_complete; int error_comps_pending; }; /* * @max_write_sge: Maximum SGE elements per RDMA WRITE request. * @max_read_sge: Maximum SGE elements per RDMA READ request. */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; struct ib_cq *send_cq; struct ib_cq *recv_cq; spinlock_t mr_lock; int mrs_used; struct list_head rdma_mrs; struct list_head sig_mrs; struct ib_srq *srq; struct ib_xrcd *xrcd; /* XRC TGT QPs only */ struct list_head xrcd_list; /* count times opened, mcast attaches, flow attaches */ atomic_t usecnt; struct list_head open_list; struct ib_qp *real_qp; struct ib_uqp_object *uobject; void (*event_handler)(struct ib_event *, void *); void *qp_context; /* sgid_attrs associated with the AV's */ const struct ib_gid_attr *av_sgid_attr; const struct ib_gid_attr *alt_path_sgid_attr; u32 qp_num; u32 max_write_sge; u32 max_read_sge; enum ib_qp_type qp_type; struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; u32 port; bool integrity_en; /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; /* The counter the qp is bind to */ struct rdma_counter *counter; }; struct ib_dm { struct ib_device *device; u32 length; u32 flags; struct ib_uobject *uobject; atomic_t usecnt; }; struct ib_mr { struct ib_device *device; struct ib_pd *pd; u32 lkey; u32 rkey; u64 iova; u64 length; unsigned int page_size; enum ib_mr_type type; bool need_inval; union { struct ib_uobject *uobject; /* user */ struct list_head qp_entry; /* FR */ }; struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ /* * Implementation details of the RDMA core, don't use in drivers: */ struct rdma_restrack_entry res; }; struct ib_mw { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; u32 rkey; enum ib_mw_type type; }; /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ IB_FLOW_ATTR_NORMAL = 0x0, /* default unicast and multicast rule - * receive all Eth traffic which isn't steered to any QP */ IB_FLOW_ATTR_ALL_DEFAULT = 0x1, /* default multicast rule - * receive all Eth multicast traffic which isn't steered to any QP */ IB_FLOW_ATTR_MC_DEFAULT = 0x2, /* sniffer rule - receive all port traffic */ IB_FLOW_ATTR_SNIFFER = 0x3 }; /* Supported steering header types */ enum ib_flow_spec_type { /* L2 headers*/ IB_FLOW_SPEC_ETH = 0x20, IB_FLOW_SPEC_IB = 0x22, /* L3 header*/ IB_FLOW_SPEC_IPV4 = 0x30, IB_FLOW_SPEC_IPV6 = 0x31, IB_FLOW_SPEC_ESP = 0x34, /* L4 headers*/ IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, IB_FLOW_SPEC_GRE = 0x51, IB_FLOW_SPEC_MPLS = 0x60, IB_FLOW_SPEC_INNER = 0x100, /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, IB_FLOW_SPEC_ACTION_DROP = 0x1001, IB_FLOW_SPEC_ACTION_HANDLE = 0x1002, IB_FLOW_SPEC_ACTION_COUNT = 0x1003, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 10 enum ib_flow_flags { IB_FLOW_ATTR_FLAGS_DONT_TRAP = 1UL << 1, /* Continue match, no steal */ IB_FLOW_ATTR_FLAGS_EGRESS = 1UL << 2, /* Egress flow */ IB_FLOW_ATTR_FLAGS_RESERVED = 1UL << 3 /* Must be last */ }; struct ib_flow_eth_filter { u8 dst_mac[6]; u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_eth { u32 type; u16 size; struct ib_flow_eth_filter val; struct ib_flow_eth_filter mask; }; struct ib_flow_ib_filter { __be16 dlid; __u8 sl; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_ib { u32 type; u16 size; struct ib_flow_ib_filter val; struct ib_flow_ib_filter mask; }; /* IPv4 header flags */ enum ib_ipv4_flags { IB_IPV4_DONT_FRAG = 0x2, /* Don't enable packet fragmentation */ IB_IPV4_MORE_FRAG = 0X4 /* For All fragmented packets except the last have this flag set */ }; struct ib_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; u8 proto; u8 tos; u8 ttl; u8 flags; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_ipv4 { u32 type; u16 size; struct ib_flow_ipv4_filter val; struct ib_flow_ipv4_filter mask; }; struct ib_flow_ipv6_filter { u8 src_ip[16]; u8 dst_ip[16]; __be32 flow_label; u8 next_hdr; u8 traffic_class; u8 hop_limit; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_ipv6 { u32 type; u16 size; struct ib_flow_ipv6_filter val; struct ib_flow_ipv6_filter mask; }; struct ib_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_tcp_udp { u32 type; u16 size; struct ib_flow_tcp_udp_filter val; struct ib_flow_tcp_udp_filter mask; }; struct ib_flow_tunnel_filter { __be32 tunnel_id; u8 real_sz[]; }; /* ib_flow_spec_tunnel describes the Vxlan tunnel * the tunnel_id from val has the vni value */ struct ib_flow_spec_tunnel { u32 type; u16 size; struct ib_flow_tunnel_filter val; struct ib_flow_tunnel_filter mask; }; struct ib_flow_esp_filter { __be32 spi; __be32 seq; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_esp { u32 type; u16 size; struct ib_flow_esp_filter val; struct ib_flow_esp_filter mask; }; struct ib_flow_gre_filter { __be16 c_ks_res0_ver; __be16 protocol; __be32 key; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_gre { u32 type; u16 size; struct ib_flow_gre_filter val; struct ib_flow_gre_filter mask; }; struct ib_flow_mpls_filter { __be32 tag; /* Must be last */ u8 real_sz[]; }; struct ib_flow_spec_mpls { u32 type; u16 size; struct ib_flow_mpls_filter val; struct ib_flow_mpls_filter mask; }; struct ib_flow_spec_action_tag { enum ib_flow_spec_type type; u16 size; u32 tag_id; }; struct ib_flow_spec_action_drop { enum ib_flow_spec_type type; u16 size; }; struct ib_flow_spec_action_handle { enum ib_flow_spec_type type; u16 size; struct ib_flow_action *act; }; enum ib_counters_description { IB_COUNTER_PACKETS, IB_COUNTER_BYTES, }; struct ib_flow_spec_action_count { enum ib_flow_spec_type type; u16 size; struct ib_counters *counters; }; union ib_flow_spec { struct { u32 type; u16 size; }; struct ib_flow_spec_eth eth; struct ib_flow_spec_ib ib; struct ib_flow_spec_ipv4 ipv4; struct ib_flow_spec_tcp_udp tcp_udp; struct ib_flow_spec_ipv6 ipv6; struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_esp esp; struct ib_flow_spec_gre gre; struct ib_flow_spec_mpls mpls; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; struct ib_flow_spec_action_handle action; struct ib_flow_spec_action_count flow_count; }; struct ib_flow_attr { enum ib_flow_attr_type type; u16 size; u16 priority; u32 flags; u8 num_of_specs; u32 port; union ib_flow_spec flows[]; }; struct ib_flow { struct ib_qp *qp; struct ib_device *device; struct ib_uobject *uobject; }; enum ib_flow_action_type { IB_FLOW_ACTION_UNSPECIFIED, IB_FLOW_ACTION_ESP = 1, }; struct ib_flow_action_attrs_esp_keymats { enum ib_uverbs_flow_action_esp_keymat protocol; union { struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm; } keymat; }; struct ib_flow_action_attrs_esp_replays { enum ib_uverbs_flow_action_esp_replay protocol; union { struct ib_uverbs_flow_action_esp_replay_bmp bmp; } replay; }; enum ib_flow_action_attrs_esp_flags { /* All user-space flags at the top: Use enum ib_uverbs_flow_action_esp_flags * This is done in order to share the same flags between user-space and * kernel and spare an unnecessary translation. */ /* Kernel flags */ IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32, IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33, }; struct ib_flow_spec_list { struct ib_flow_spec_list *next; union ib_flow_spec spec; }; struct ib_flow_action_attrs_esp { struct ib_flow_action_attrs_esp_keymats *keymat; struct ib_flow_action_attrs_esp_replays *replay; struct ib_flow_spec_list *encap; /* Used only if IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is enabled. * Value of 0 is a valid value. */ u32 esn; u32 spi; u32 seq; u32 tfc_pad; /* Use enum ib_flow_action_attrs_esp_flags */ u64 flags; u64 hard_limit_pkts; }; struct ib_flow_action { struct ib_device *device; struct ib_uobject *uobject; enum ib_flow_action_type type; atomic_t usecnt; }; struct ib_mad; enum ib_process_mad_flags { IB_MAD_IGNORE_MKEY = 1, IB_MAD_IGNORE_BKEY = 2, IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY }; enum ib_mad_result { IB_MAD_RESULT_FAILURE = 0, /* (!SUCCESS is the important flag) */ IB_MAD_RESULT_SUCCESS = 1 << 0, /* MAD was successfully processed */ IB_MAD_RESULT_REPLY = 1 << 1, /* Reply packet needs to be sent */ IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; struct ib_port_cache { u64 subnet_prefix; struct ib_pkey_cache *pkey; struct ib_gid_table *gid; u8 lmc; enum ib_port_state port_state; }; struct ib_port_immutable { int pkey_tbl_len; int gid_tbl_len; u32 core_cap_flags; u32 max_mad_size; }; struct ib_port_data { struct ib_device *ib_dev; struct ib_port_immutable immutable; spinlock_t pkey_list_lock; spinlock_t netdev_lock; struct list_head pkey_list; struct ib_port_cache cache; struct net_device __rcu *netdev; netdevice_tracker netdev_tracker; struct hlist_node ndev_hash_link; struct rdma_port_counter port_counter; struct ib_port *sysfs; }; /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { RDMA_NETDEV_OPA_VNIC, RDMA_NETDEV_IPOIB, }; /** * struct rdma_netdev - rdma netdev * For cases where netstack interfacing is required. */ struct rdma_netdev { void *clnt_priv; struct ib_device *hca; u32 port_num; int mtu; /* * cleanup function must be specified. * FIXME: This is only used for OPA_VNIC and that usage should be * removed too. */ void (*free_rdma_netdev)(struct net_device *netdev); /* control functions */ void (*set_id)(struct net_device *netdev, int id); /* send packet */ int (*send)(struct net_device *dev, struct sk_buff *skb, struct ib_ah *address, u32 dqpn); /* multicast */ int (*attach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid, int set_qkey, u32 qkey); int (*detach_mcast)(struct net_device *dev, struct ib_device *hca, union ib_gid *gid, u16 mlid); /* timeout */ void (*tx_timeout)(struct net_device *dev, unsigned int txqueue); }; struct rdma_netdev_alloc_params { size_t sizeof_priv; unsigned int txqs; unsigned int rxqs; void *param; int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num, struct net_device *netdev, void *param); }; struct ib_odp_counters { atomic64_t faults; atomic64_t invalidations; atomic64_t prefetch; }; struct ib_counters { struct ib_device *device; struct ib_uobject *uobject; /* num of objects attached */ atomic_t usecnt; }; struct ib_counters_read_attr { u64 *counters_buff; u32 ncounters; u32 flags; /* use enum ib_read_counters_flags */ }; struct uverbs_attr_bundle; struct iw_cm_id; struct iw_cm_conn_param; #define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \ .size_##ib_struct = \ (sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \ BUILD_BUG_ON_ZERO( \ !__same_type(((struct drv_struct *)NULL)->member, \ struct ib_struct))) #define rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, gfp) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ gfp, false)) #define rdma_zalloc_drv_obj_numa(ib_dev, ib_type) \ ((struct ib_type *)rdma_zalloc_obj(ib_dev, ib_dev->ops.size_##ib_type, \ GFP_KERNEL, true)) #define rdma_zalloc_drv_obj(ib_dev, ib_type) \ rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, GFP_KERNEL) #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct struct rdma_user_mmap_entry { struct kref ref; struct ib_ucontext *ucontext; unsigned long start_pgoff; size_t npages; bool driver_removed; }; /* Return the offset (in bytes) the user should pass to libc's mmap() */ static inline u64 rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry) { return (u64)entry->start_pgoff << PAGE_SHIFT; } /** * struct ib_device_ops - InfiniBand device operations * This structure defines all the InfiniBand device operations, providers will * need to define the supported operations, otherwise they will be set to null. */ struct ib_device_ops { struct module *owner; enum rdma_driver_id driver_id; u32 uverbs_abi_ver; unsigned int uverbs_no_driver_id_binding:1; /* * NOTE: New drivers should not make use of device_group; instead new * device parameter should be exposed via netlink command. This * mechanism exists only for existing drivers. */ const struct attribute_group *device_group; const struct attribute_group **port_groups; int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr); int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); void (*drain_rq)(struct ib_qp *qp); void (*drain_sq)(struct ib_qp *qp); int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc); int (*peek_cq)(struct ib_cq *cq, int wc_cnt); int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags); int (*post_srq_recv)(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int (*process_mad)(struct ib_device *device, int process_mad_flags, u32 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad *in_mad, struct ib_mad *out_mad, size_t *out_mad_size, u16 *out_mad_pkey_index); int (*query_device)(struct ib_device *device, struct ib_device_attr *device_attr, struct ib_udata *udata); int (*modify_device)(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); void (*get_dev_fw_str)(struct ib_device *device, char *str); const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, int comp_vector); int (*query_port)(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); /** * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device * in fast paths. */ int (*get_port_immutable)(struct ib_device *device, u32 port_num, struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u32 port_num); /** * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. The vendor driver should call dev_hold * on this net device. The HW vendor's device driver must guarantee * that this function returns NULL before the net device has finished * NETDEV_UNREGISTER state. */ struct net_device *(*get_netdev)(struct ib_device *device, u32 port_num); /** * rdma netdev operation * * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params * must return -EOPNOTSUPP if it doesn't support the specified type. */ struct net_device *(*alloc_rdma_netdev)( struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); /** * query_gid should be return GID value for @device, when @port_num * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. */ int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); /** * When calling add_gid, the HW vendor's driver should add the gid * of device of port at gid index available at @attr. Meta-info of * that gid (for example, the network device related to this gid) is * available at @attr. @context allows the HW vendor driver to store * extra information together with a GID entry. The HW vendor driver may * allocate memory to contain this information and store it in @context * when a new GID entry is written to. Params are consistent until the * next call of add_gid or delete_gid. The function should return 0 on * success or error otherwise. The function could be called * concurrently for different ports. This function is only called when * roce_gid_table is used. */ int (*add_gid)(const struct ib_gid_attr *attr, void **context); /** * When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index gid_index of port port_num * available in @attr. * Upon the deletion of a GID entry, the HW vendor must free any * allocated memory. The caller will clear @context afterwards. * This function is only called when roce_gid_table is used. */ int (*del_gid)(const struct ib_gid_attr *attr, void **context); int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int (*alloc_ucontext)(struct ib_ucontext *context, struct ib_udata *udata); void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); /** * This will be called once refcount of an entry in mmap_xa reaches * zero. The type of the memory that was mapped may differ between * entries and is opaque to the rdma_user_mmap interface. * Therefore needs to be implemented by the driver in mmap_free. */ void (*mmap_free)(struct rdma_user_mmap_entry *entry); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); int (*create_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*create_user_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah, u32 flags); int (*create_srq)(struct ib_srq *srq, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata); int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask, struct ib_udata *udata); int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int (*destroy_srq)(struct ib_srq *srq, struct ib_udata *udata); int (*create_qp)(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, struct ib_udata *udata); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata); struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); int (*advise_mr)(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge, struct uverbs_attr_bundle *attrs); /* * Kernel users should universally support relaxed ordering (RO), as * they are designed to read data only after observing the CQE and use * the DMA API correctly. * * Some drivers implicitly enable RO if platform supports it. */ int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); int (*alloc_mw)(struct ib_mw *mw, struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata); struct ib_flow *(*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, struct ib_udata *udata); int (*destroy_flow)(struct ib_flow *flow_id); int (*destroy_flow_action)(struct ib_flow_action *action); int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port, int state); int (*get_vf_config)(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *ivf); int (*get_vf_stats)(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int (*get_vf_guid)(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid, int type); struct ib_wq *(*create_wq)(struct ib_pd *pd, struct ib_wq_init_attr *init_attr, struct ib_udata *udata); int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata); int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr, u32 wq_attr_mask, struct ib_udata *udata); int (*create_rwq_ind_table)(struct ib_rwq_ind_table *ib_rwq_ind_table, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); struct ib_dm *(*alloc_dm)(struct ib_device *device, struct ib_ucontext *context, struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); int (*create_counters)(struct ib_counters *counters, struct uverbs_attr_bundle *attrs); int (*destroy_counters)(struct ib_counters *counters); int (*read_counters)(struct ib_counters *counters, struct ib_counters_read_attr *counters_read_attr, struct uverbs_attr_bundle *attrs); int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset); /** * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and * fill in the driver initialized data. The struct is kfree()'ed by * the sysfs core when the device is removed. A lifespan of -1 in the * return struct tells the core to set a default lifespan. */ struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device); struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device, u32 port_num); /** * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated * Return codes - * < 0 - Error, no counters updated * index - Updated the single counter pointed to by index * num_counters - Updated all counters (will reset the timestamp * and prevent further calls for lifespan milliseconds) * Drivers are allowed to update all counters in leiu of just the * one given in index at their option */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u32 port, int index); /** * modify_hw_stat - Modify the counter configuration * @enable: true/false when enable/disable a counter * Return codes - 0 on success or error code otherwise. */ int (*modify_hw_stat)(struct ib_device *device, u32 port, unsigned int counter_index, bool enable); /** * Allows rdma drivers to add their own restrack attributes. */ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr); int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq); int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp); int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id); /* Device lifecycle callbacks */ /* * Called after the device becomes registered, before clients are * attached */ int (*enable_driver)(struct ib_device *dev); /* * This is called as part of ib_dealloc_device(). */ void (*dealloc_driver)(struct ib_device *dev); /* iWarp CM callbacks */ void (*iw_add_ref)(struct ib_qp *qp); void (*iw_rem_ref)(struct ib_qp *qp); struct ib_qp *(*iw_get_qp)(struct ib_device *device, int qpn); int (*iw_connect)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_accept)(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int (*iw_reject)(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); /** * counter_bind_qp - Bind a QP to a counter. * @counter - The counter to be bound. If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp); /** * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ int (*counter_unbind_qp)(struct ib_qp *qp); /** * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); /** * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in * the driver initialized data. */ struct rdma_hw_stats *(*counter_alloc_stats)( struct rdma_counter *counter); /** * counter_update_stats - Query the stats value of this counter */ int (*counter_update_stats)(struct rdma_counter *counter); /** * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. */ int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); /* query driver for its ucontext properties */ int (*query_ucontext)(struct ib_ucontext *context, struct uverbs_attr_bundle *attrs); /* * Provide NUMA node. This API exists for rdmavt/hfi1 only. * Everyone else relies on Linux memory management model. */ int (*get_numa_node)(struct ib_device *dev); DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); DECLARE_RDMA_OBJ_SIZE(ib_mw); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_qp); DECLARE_RDMA_OBJ_SIZE(ib_rwq_ind_table); DECLARE_RDMA_OBJ_SIZE(ib_srq); DECLARE_RDMA_OBJ_SIZE(ib_ucontext); DECLARE_RDMA_OBJ_SIZE(ib_xrcd); }; struct ib_core_device { /* device must be the first element in structure until, * union of ib_core_device and device exists in ib_device. */ struct device dev; possible_net_t rdma_net; struct kobject *ports_kobj; struct list_head port_list; struct ib_device *owner; /* reach back to owner ib_device */ }; struct rdma_restrack_root; struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. */ struct device *dma_device; struct ib_device_ops ops; char name[IB_DEVICE_NAME_MAX]; struct rcu_head rcu_head; struct list_head event_handler_list; /* Protects event_handler_list */ struct rw_semaphore event_handler_rwsem; /* Protects QP's event_handler calls and open_qp list */ spinlock_t qp_open_list_lock; struct rw_semaphore client_data_rwsem; struct xarray client_data; struct mutex unregistration_lock; /* Synchronize GID, Pkey cache entries, subnet prefix, LMC */ rwlock_t cache_lock; /** * port_data is indexed by port number */ struct ib_port_data *port_data; int num_comp_vectors; union { struct device dev; struct ib_core_device coredev; }; /* First group is for device attributes, * Second group is for driver provided attributes (optional). * Third group is for the hw_stats * It is a NULL terminated array. */ const struct attribute_group *groups[4]; u64 uverbs_cmd_mask; char node_desc[IB_DEVICE_NODE_DESC_MAX]; __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; /* Indicates kernel verbs support, should not be used in drivers */ u16 kverbs_provider:1; /* CQ adaptive moderation (RDMA DIM) */ u16 use_cq_dim:1; u8 node_type; u32 phys_port_cnt; struct ib_device_attr attrs; struct hw_stats_device_data *hw_stats_data; #ifdef CONFIG_CGROUP_RDMA struct rdmacg_device cg_device; #endif u32 index; spinlock_t cq_pools_lock; struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1]; struct rdma_restrack_root *res; const struct uapi_definition *driver_def; /* * Positive refcount indicates that the device is currently * registered and cannot be unregistered. */ refcount_t refcount; struct completion unreg_completion; struct work_struct unregistration_work; const struct rdma_link_ops *link_ops; /* Protects compat_devs xarray modifications */ struct mutex compat_devs_mutex; /* Maintains compat devices for each net namespace */ struct xarray compat_devs; /* Used by iWarp CM */ char iw_ifname[IFNAMSIZ]; u32 iw_driver_flags; u32 lag_flags; }; static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size, gfp_t gfp, bool is_numa_aware) { if (is_numa_aware && dev->ops.get_numa_node) return kzalloc_node(size, gfp, dev->ops.get_numa_node(dev)); return kzalloc(size, gfp); } struct ib_client_nl_info; struct ib_client { const char *name; int (*add)(struct ib_device *ibdev); void (*remove)(struct ib_device *, void *client_data); void (*rename)(struct ib_device *dev, void *client_data); int (*get_nl_info)(struct ib_device *ibdev, void *client_data, struct ib_client_nl_info *res); int (*get_global_nl_info)(struct ib_client_nl_info *res); /* Returns the net_dev belonging to this ib_client and matching the * given parameters. * @dev: An RDMA device that the net_dev use for communication. * @port: A physical port number on the RDMA device. * @pkey: P_Key that the net_dev uses if applicable. * @gid: A GID that the net_dev uses to communicate. * @addr: An IP address the net_dev is configured with. * @client_data: The device's client data set by ib_set_client_data(). * * An ib_client that implements a net_dev on top of RDMA devices * (such as IP over IB) should implement this callback, allowing the * rdma_cm module to find the right net_dev for a given request. * * The caller is responsible for calling dev_put on the returned * netdev. */ struct net_device *(*get_net_dev_by_params)( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); refcount_t uses; struct completion uses_zero; u32 client_id; /* kverbs are not required by the client */ u8 no_kverbs_req:1; }; /* * IB block DMA iterator * * Iterates the DMA-mapped SGL in contiguous memory blocks aligned * to a HW supported page size. */ struct ib_block_iter { /* internal states */ struct scatterlist *__sg; /* sg holding the current aligned block */ dma_addr_t __dma_addr; /* unaligned DMA address of this block */ unsigned int __sg_nents; /* number of SG entries */ unsigned int __sg_advance; /* number of bytes to advance in sg in next step */ unsigned int __pg_bit; /* alignment of current block */ }; struct ib_device *_ib_alloc_device(size_t size); #define ib_alloc_device(drv_struct, member) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ struct drv_struct, member))), \ struct drv_struct, member) void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, const char *name, struct device *dma_device); void ib_unregister_device(struct ib_device *device); void ib_unregister_driver(enum rdma_driver_id driver_id); void ib_unregister_device_and_put(struct ib_device *device); void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); void __rdma_block_iter_start(struct ib_block_iter *biter, struct scatterlist *sglist, unsigned int nents, unsigned long pgsz); bool __rdma_block_iter_next(struct ib_block_iter *biter); /** * rdma_block_iter_dma_address - get the aligned dma address of the current * block held by the block iterator. * @biter: block iterator holding the memory block */ static inline dma_addr_t rdma_block_iter_dma_address(struct ib_block_iter *biter) { return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1); } /** * rdma_for_each_block - iterate over contiguous memory blocks of the sg list * @sglist: sglist to iterate over * @biter: block iterator holding the memory block * @nents: maximum number of sg entries to iterate over * @pgsz: best HW supported page size to use * * Callers may use rdma_block_iter_dma_address() to get each * blocks aligned DMA address. */ #define rdma_for_each_block(sglist, biter, nents, pgsz) \ for (__rdma_block_iter_start(biter, sglist, nents, \ pgsz); \ __rdma_block_iter_next(biter);) /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns the client context data set with * ib_set_client_data(). This can only be called while the client is * registered to the device, once the ib_client remove() callback returns this * cannot be called. */ static inline void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { return xa_load(&device->client_data, client->client_id); } void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); void ib_set_device_ops(struct ib_device *device, const struct ib_device_ops *ops); int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, unsigned long pfn, unsigned long size, pgprot_t prot, struct rdma_user_mmap_entry *entry); int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length); int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 min_pgoff, u32 max_pgoff); static inline int rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext, struct rdma_user_mmap_entry *entry, size_t length, u32 pgoff) { return rdma_user_mmap_entry_insert_range(ucontext, entry, length, pgoff, pgoff); } struct rdma_user_mmap_entry * rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext, unsigned long pgoff); struct rdma_user_mmap_entry * rdma_user_mmap_entry_get(struct ib_ucontext *ucontext, struct vm_area_struct *vma); void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry); void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry); static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; } static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; } static inline bool ib_is_buffer_cleared(const void __user *p, size_t len) { bool ret; u8 *buf; if (len > USHRT_MAX) return false; buf = memdup_user(p, len); if (IS_ERR(buf)) return false; ret = !memchr_inv(buf, 0, len); kfree(buf); return ret; } static inline bool ib_is_udata_cleared(struct ib_udata *udata, size_t offset, size_t len) { return ib_is_buffer_cleared(udata->inbuf + offset, len); } /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for * the given QP state transition. * @cur_state: Current QP state * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It * checks that cur_state and next_state are valid QP states, that a * transition from cur_state to next_state is allowed by the IB spec, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(const struct ib_event *event); int ib_query_port(struct ib_device *device, u32 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u32 port_num); /** * rdma_cap_ib_switch - Check if the device is IB switch * @device: Device to check * * Device driver is responsible for setting is_switch bit on * in ib_device structure at init time. * * Return: true if the device is IB switch. */ static inline bool rdma_cap_ib_switch(const struct ib_device *device) { return device->is_switch; } /** * rdma_start_port - Return the first valid port number for the device * specified * * @device: Device to be checked * * Return start port number */ static inline u32 rdma_start_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : 1; } /** * rdma_for_each_port - Iterate over all valid port numbers of the IB device * @device - The struct ib_device * to iterate over * @iter - The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ for (iter = rdma_start_port(device + \ BUILD_BUG_ON_ZERO(!__same_type(u32, \ iter))); \ iter <= rdma_end_port(device); iter++) /** * rdma_end_port - Return the last valid port number for the device * specified * * @device: Device to be checked * * Return last port number */ static inline u32 rdma_end_port(const struct ib_device *device) { return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; } static inline int rdma_is_port_valid(const struct ib_device *device, unsigned int port) { return (port >= rdma_start_port(device) && port <= rdma_end_port(device)); } static inline bool rdma_is_grh_required(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_IB_GRH_REQUIRED; } static inline bool rdma_protocol_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u32 port_num) { return rdma_protocol_ib(device, port_num) || rdma_protocol_roce(device, port_num); } static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; } static inline bool rdma_protocol_usnic(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; } /** * rdma_cap_ib_mad - Check if the port of a device supports Infiniband * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Management Datagrams (MAD) are a required part of the InfiniBand * specification and are supported on all InfiniBand devices. A slightly * extended version are also supported on OPA interfaces. * * Return: true if the port supports sending/receiving of MAD packets. */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_MAD; } /** * rdma_cap_opa_mad - Check if the port of device provides support for OPA * Management Datagrams. * @device: Device to check * @port_num: Port number to check * * Intel OmniPath devices extend and/or replace the InfiniBand Management * datagrams with their own versions. These OPA MADs share many but not all of * the characteristics of InfiniBand MADs. * * OPA MADs differ in the following ways: * * 1) MADs are variable size up to 2K * IBTA defined MADs remain fixed at 256 bytes * 2) OPA SMPs must carry valid PKeys * 3) OPA SMP packets are a different format * * Return: true if the port supports OPA MAD packet formats. */ static inline bool rdma_cap_opa_mad(struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_MAD; } /** * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). * @device: Device to check * @port_num: Port number to check * * Each InfiniBand node is required to provide a Subnet Management Agent * that the subnet manager can access. Prior to the fabric being fully * configured by the subnet manager, the SMA is accessed via a well known * interface called the Subnet Management Interface (SMI). This interface * uses directed route packets to communicate with the SM to get around the * chicken and egg problem of the SM needing to know what's on the fabric * in order to configure the fabric, and needing to configure the fabric in * order to send packets to the devices on the fabric. These directed * route packets do not need the fabric fully configured in order to reach * their destination. The SMI is the only method allowed to send * directed route packets on an InfiniBand fabric. * * Return: true if the port provides an SMI. */ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SMI; } /** * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband * Communication Manager. * @device: Device to check * @port_num: Port number to check * * The InfiniBand Communication Manager is one of many pre-defined General * Service Agents (GSA) that are accessed via the General Service * Interface (GSI). It's role is to facilitate establishment of connections * between nodes as well as other management related tasks for established * connections. * * Return: true if the port supports an IB CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_CM; } /** * rdma_cap_iw_cm - Check if the port of device has the capability IWARP * Communication Manager. * @device: Device to check * @port_num: Port number to check * * Similar to above, but specific to iWARP connections which have a different * managment protocol than InfiniBand. * * Return: true if the port supports an iWARP CM (this does not guarantee that * a CM is actually running however). */ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IW_CM; } /** * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband * Subnet Administration. * @device: Device to check * @port_num: Port number to check * * An InfiniBand Subnet Administration (SA) service is a pre-defined General * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand * fabrics, devices should resolve routes to other hosts by contacting the * SA to query the proper route. * * Return: true if the port should act as a client to the fabric Subnet * Administration interface. This does not imply that the SA service is * running locally. */ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_IB_SA; } /** * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband * Multicast. * @device: Device to check * @port_num: Port number to check * * InfiniBand multicast registration is more complex than normal IPv4 or * IPv6 multicast registration. Each Host Channel Adapter must register * with the Subnet Manager when it wishes to join a multicast group. It * should do so only once regardless of how many queue pairs it subscribes * to this group. And it should leave the group only after all queue pairs * attached to the group have been detached. * * Return: true if the port must undertake the additional adminstrative * overhead of registering/unregistering with the SM and tracking of the * total number of queue pairs attached to the multicast group. */ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u32 port_num) { return rdma_cap_ib_sa(device, port_num); } /** * rdma_cap_af_ib - Check if the port of device has the capability * Native Infiniband Address. * @device: Device to check * @port_num: Port number to check * * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default * GID. RoCE uses a different mechanism, but still generates a GID via * a prescribed mechanism and port specific data. * * Return: true if the port uses a GID address to identify devices on the * network. */ static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_AF_IB; } /** * rdma_cap_eth_ah - Check if the port of device has the capability * Ethernet Address Handle. * @device: Device to check * @port_num: Port number to check * * RoCE is InfiniBand over Ethernet, and it uses a well defined technique * to fabricate GIDs over Ethernet/IP specific addresses native to the * port. Normally, packet headers are generated by the sending host * adapter, but when sending connectionless datagrams, we must manually * inject the proper headers for the fabric we are communicating over. * * Return: true if we are running as a RoCE port and must force the * addition of a Global Route Header built from our Ethernet Address * Handle into our header list for connectionless packets. */ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_ETH_AH; } /** * rdma_cap_opa_ah - Check if the port of device supports * OPA Address handles * @device: Device to check * @port_num: Port number to check * * Return: true if we are running on an OPA device which supports * the extended OPA addressing. */ static inline bool rdma_cap_opa_ah(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; } /** * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. * * @device: Device * @port_num: Port number * * This MAD size includes the MAD headers and MAD payload. No other headers * are included. * * Return the max MAD size required by the Port. Will return 0 if the port * does not support MADs */ static inline size_t rdma_max_mad_size(const struct ib_device *device, u32 port_num) { return device->port_data[port_num].immutable.max_mad_size; } /** * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table * @device: Device to check * @port_num: Port number to check * * RoCE GID table mechanism manages the various GIDs for a device. * * NOTE: if allocating the port's GID table has failed, this call will still * return true, but any RoCE GID table API will fail. * * Return: true if the port uses RoCE GID table mechanism in order to manage * its GIDs. */ static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, u32 port_num) { return rdma_protocol_roce(device, port_num) && device->ops.add_gid && device->ops.del_gid; } /* * Check if the device supports READ W/ INVALIDATE. */ static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num) { /* * iWarp drivers must support READ W/ INVALIDATE. No other protocol * has support for it yet. */ return rdma_protocol_iwarp(dev, port_num); } /** * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not. * @device: Device * @port_num: 1 based Port number * * Return true if port is an Intel OPA port , false if not */ static inline bool rdma_core_cap_opa_port(struct ib_device *device, u32 port_num) { return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_PORT_INTEL_OPA) == RDMA_CORE_PORT_INTEL_OPA; } /** * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. * @device: Device * @port_num: Port number * @mtu: enum value of MTU * * Return the MTU size supported by the port as an integer value. Will return * -1 if enum value of mtu is not supported. */ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, int mtu) { if (rdma_core_cap_opa_port(device, port)) return opa_mtu_enum_to_int((enum opa_mtu)mtu); else return ib_mtu_enum_to_int((enum ib_mtu)mtu); } /** * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. * @device: Device * @port_num: Port number * @attr: port attribute * * Return the MTU size supported by the port as an integer value. */ static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port, struct ib_port_attr *attr) { if (rdma_core_cap_opa_port(device, port)) return attr->phys_mtu; else return ib_mtu_enum_to_int(attr->max_mtu); } int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state); int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info); int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats); int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type); int ib_query_pkey(struct ib_device *device, u32 port_num, u16 index, u16 *pkey); int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify); int ib_modify_port(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, u32 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u32 port_num, u16 pkey, u16 *index); enum ib_pd_flags { /* * Create a memory registration for all memory in the system and place * the rkey for it into pd->unsafe_global_rkey. This can be used by * ULPs to avoid the overhead of dynamic MRs. * * This flag is generally considered unsafe and must only be used in * extremly trusted environments. Every use of it will log a warning * in the kernel log. */ IB_PD_UNSAFE_GLOBAL_RKEY = 0x01, }; struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller); /** * ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * @flags: protection domain flags * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. * * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. */ #define ib_alloc_pd(device, flags) \ __ib_alloc_pd((device), (flags), KBUILD_MODNAME) int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata); /** * ib_dealloc_pd - Deallocate kernel PD * @pd: The protection domain * * NOTE: for user PD use ib_dealloc_pd_user with valid udata! */ static inline void ib_dealloc_pd(struct ib_pd *pd) { int ret = ib_dealloc_pd_user(pd, NULL); WARN_ONCE(ret, "Destroy of kernel PD shouldn't fail"); } enum rdma_create_ah_flags { /* In a sleepable context */ RDMA_CREATE_AH_SLEEPABLE = BIT(0), }; /** * rdma_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @flags: Create address handle flags (see enum rdma_create_ah_flags). * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags); /** * rdma_create_user_ah - Creates an address handle for the given address vector. * It resolves destination mac address for ah attribute of RoCE type. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @udata: pointer to user's input output buffer information need by * provider driver. * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata); /** * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header * work completion. * @hdr: the L3 header to parse * @net_type: type of header to parse * @sgid: place to store source gid * @dgid: place to store destination gid */ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, enum rdma_network_type net_type, union ib_gid *sgid, union ib_gid *dgid); /** * ib_get_rdma_header_version - Get the header version * @hdr: the L3 header to parse */ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); /** * ib_init_ah_attr_from_wc - Initializes address handle attributes from a * work completion. * @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. * @wc: Work completion associated with the received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. * When ib_init_ah_attr_from_wc() returns success, * (a) for IB link layer it optionally contains a reference to SGID attribute * when GRH is present for IB link layer. * (b) for RoCE link layer it contains a reference to SGID attribute. * User must invoke rdma_cleanup_ah_attr_gid_attr() to release reference to SGID * attributes which are initialized using ib_init_ah_attr_from_wc(). * */ int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the * sender of the specified work completion. * @pd: The protection domain associated with the address handle. * @wc: Work completion information associated with a received message. * @grh: References the received global route header. This parameter is * ignored unless the work completion indicates that the GRH is valid. * @port_num: The outbound port number to associate with the address. * * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u32 port_num); /** * rdma_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); /** * rdma_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); enum rdma_destroy_ah_flags { /* In a sleepable context */ RDMA_DESTROY_AH_SLEEPABLE = BIT(0), }; /** * rdma_destroy_ah_user - Destroys an address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * @udata: Valid user data or NULL for kernel objects */ int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata); /** * rdma_destroy_ah - Destroys an kernel address handle. * @ah: The address handle to destroy. * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags). * * NOTE: for user ah use rdma_destroy_ah_user with valid udata! */ static inline void rdma_destroy_ah(struct ib_ah *ah, u32 flags) { int ret = rdma_destroy_ah_user(ah, flags, NULL); WARN_ONCE(ret, "Destroy of kernel AH shouldn't fail"); } struct ib_srq *ib_create_srq_user(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_usrq_object *uobject, struct ib_udata *udata); static inline struct ib_srq * ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) { if (!pd->device->ops.create_srq) return ERR_PTR(-EOPNOTSUPP); return ib_create_srq_user(pd, srq_init_attr, NULL, NULL); } /** * ib_modify_srq - Modifies the attributes for the specified SRQ. * @srq: The SRQ to modify. * @srq_attr: On input, specifies the SRQ attributes to modify. On output, * the current values of selected SRQ attributes are returned. * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ * are being modified. * * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or * IB_SRQ_LIMIT to set the SRQ's limit and request notification when * the number of receives queued drops below the limit. */ int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask); /** * ib_query_srq - Returns the attribute list and current values for the * specified SRQ. * @srq: The SRQ to query. * @srq_attr: The attributes of the specified SRQ. */ int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); /** * ib_destroy_srq_user - Destroys the specified SRQ. * @srq: The SRQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata); /** * ib_destroy_srq - Destroys the specified kernel SRQ. * @srq: The SRQ to destroy. * * NOTE: for user srq use ib_destroy_srq_user with valid udata! */ static inline void ib_destroy_srq(struct ib_srq *srq) { int ret = ib_destroy_srq_user(srq, NULL); WARN_ONCE(ret, "Destroy of kernel SRQ shouldn't fail"); } /** * ib_post_srq_recv - Posts a list of work requests to the specified SRQ. * @srq: The SRQ to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return srq->device->ops.post_srq_recv(srq, recv_wr, bad_recv_wr ? : &dummy); } struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, const char *caller); /** * ib_create_qp - Creates a kernel QP associated with the specific protection * domain. * @pd: The protection domain associated with the QP. * @init_attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. */ static inline struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr) { return ib_create_qp_kernel(pd, init_attr, KBUILD_MODNAME); } /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. * @qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. * @udata: pointer to user's input output buffer information * are being modified. * It returns 0 on success and returns appropriate error code on error. */ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); /** * ib_modify_qp - Modifies the attributes for the specified QP and then * transitions the QP to the given state. * @qp: The QP to modify. * @qp_attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @qp_attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. */ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask); /** * ib_query_qp - Returns the attribute list and current values for the * specified QP. * @qp: The QP to query. * @qp_attr: The attributes of the specified QP. * @qp_attr_mask: A bit-mask used to select specific attributes to query. * @qp_init_attr: Additional attributes of the selected QP. * * The qp_attr_mask may be used to limit the query to gathering only the * selected attributes. */ int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); /** * ib_destroy_qp - Destroys the specified QP. * @qp: The QP to destroy. * @udata: Valid udata or NULL for kernel objects */ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata); /** * ib_destroy_qp - Destroys the specified kernel QP. * @qp: The QP to destroy. * * NOTE: for user qp use ib_destroy_qp_user with valid udata! */ static inline int ib_destroy_qp(struct ib_qp *qp) { return ib_destroy_qp_user(qp, NULL); } /** * ib_open_qp - Obtain a reference to an existing sharable QP. * @xrcd - XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. */ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr); /** * ib_close_qp - Release an external reference to a QP. * @qp: The QP handle to release * * The opened QP handle is released by the caller. The underlying * shared QP is not destroyed until all internal references are released. */ int ib_close_qp(struct ib_qp *qp); /** * ib_post_send - Posts a list of work requests to the send queue of * the specified QP. * @qp: The QP to post the work request on. * @send_wr: A list of work requests to post on the send queue. * @bad_send_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. * * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate * error is returned, the QP state shall not be affected, * ib_post_send() will return an immediate error after queueing any * earlier work requests in the list. */ static inline int ib_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, const struct ib_send_wr **bad_send_wr) { const struct ib_send_wr *dummy; return qp->device->ops.post_send(qp, send_wr, bad_send_wr ? : &dummy); } /** * ib_post_recv - Posts a list of work requests to the receive queue of * the specified QP. * @qp: The QP to post the work request on. * @recv_wr: A list of work requests to post on the receive queue. * @bad_recv_wr: On an immediate failure, this parameter will reference * the work request that failed to be posted on the QP. */ static inline int ib_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr) { const struct ib_recv_wr *dummy; return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr ? : &dummy); } struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx, const char *caller); static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) { return __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx, KBUILD_MODNAME); } struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx, const char *caller); /** * ib_alloc_cq_any: Allocate kernel CQ * @dev: The IB device * @private: Private data attached to the CQE * @nr_cqe: Number of CQEs in the CQ * @poll_ctx: Context used for polling the CQ */ static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev, void *private, int nr_cqe, enum ib_poll_context poll_ctx) { return __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx, KBUILD_MODNAME); } void ib_free_cq(struct ib_cq *cq); int ib_process_cq_direct(struct ib_cq *cq, int budget); /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. * @comp_handler: A user-specified callback that is invoked when a * completion event occurs on the CQ. * @event_handler: A user-specified callback that is invoked when an * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *__ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr, const char *caller); #define ib_create_cq(device, cmp_hndlr, evt_hndlr, cq_ctxt, cq_attr) \ __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), (cq_attr), KBUILD_MODNAME) /** * ib_resize_cq - Modifies the capacity of the CQ. * @cq: The CQ to resize. * @cqe: The minimum size of the CQ. * * Users can examine the cq structure to determine the actual CQ size. */ int ib_resize_cq(struct ib_cq *cq, int cqe); /** * rdma_set_cq_moderation - Modifies moderation params of the CQ * @cq: The CQ to modify. * @cq_count: number of CQEs that will trigger an event * @cq_period: max period of time in usec before triggering an event * */ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period); /** * ib_destroy_cq_user - Destroys the specified CQ. * @cq: The CQ to destroy. * @udata: Valid user data or NULL for kernel objects */ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata); /** * ib_destroy_cq - Destroys the specified kernel CQ. * @cq: The CQ to destroy. * * NOTE: for user cq use ib_destroy_cq_user with valid udata! */ static inline void ib_destroy_cq(struct ib_cq *cq) { int ret = ib_destroy_cq_user(cq, NULL); WARN_ONCE(ret, "Destroy of kernel CQ shouldn't fail"); } /** * ib_poll_cq - poll a CQ for completion(s) * @cq:the CQ being polled * @num_entries:maximum number of completions to return * @wc:array of at least @num_entries &struct ib_wc where completions * will be returned * * Poll a CQ for (possibly multiple) completions. If the return value * is < 0, an error occurred. If the return value is >= 0, it is the * number of completions returned. If the return value is * non-negative and < num_entries, then the CQ was emptied. */ static inline int ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc) { return cq->device->ops.poll_cq(cq, num_entries, wc); } /** * ib_req_notify_cq - Request completion notification on a CQ. * @cq: The CQ to generate an event for. * @flags: * Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP * to request an event on the next solicited event or next work * completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS * may also be |ed in to request a hint about missed events, as * described below. * * Return Value: * < 0 means an error occurred while requesting notification * == 0 means notification was requested successfully, and if * IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events * were missed and it is safe to wait for another event. In * this case is it guaranteed that any work completions added * to the CQ since the last CQ poll will trigger a completion * notification event. * > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed * in. It means that the consumer must poll the CQ again to * make sure it is empty to avoid missing an event because of a * race between requesting notification and an entry being * added to the CQ. This return value means it is possible * (but not guaranteed) that a work completion has been added * to the CQ since the last poll without triggering a * completion notification event. */ static inline int ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { return cq->device->ops.req_notify_cq(cq, flags); } struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, int comp_vector_hint, enum ib_poll_context poll_ctx); void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe); /* * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to * NULL. This causes the ib_dma* helpers to just stash the kernel virtual * address into the dma address. */ static inline bool ib_uses_virt_dma(struct ib_device *dev) { return IS_ENABLED(CONFIG_INFINIBAND_VIRT_DMA) && !dev->dma_device; } /* * Check if a IB device's underlying DMA mapping supports P2PDMA transfers. */ static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return false; return dma_pci_p2pdma_supported(dev->dma_device); } /** * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after * going through the dma_addr marshalling. */ static inline void *ib_virt_dma_to_ptr(u64 dma_addr) { /* virt_dma mode maps the kvs's directly into the dma addr */ return (void *)(uintptr_t)dma_addr; } /** * ib_virt_dma_to_page - Convert a dma_addr to a struct page * @dma_addr: The DMA address * * Used by ib_uses_virt_dma() device to get back to the struct page after going * through the dma_addr marshalling. */ static inline struct page *ib_virt_dma_to_page(u64 dma_addr) { return virt_to_page(ib_virt_dma_to_ptr(dma_addr)); } /** * ib_dma_mapping_error - check a DMA addr for error * @dev: The device for which the dma_addr was created * @dma_addr: The DMA address to check */ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) { if (ib_uses_virt_dma(dev)) return 0; return dma_mapping_error(dev->dma_device, dma_addr); } /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created * @cpu_addr: The kernel virtual address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_single(struct ib_device *dev, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)cpu_addr; return dma_map_single(dev->dma_device, cpu_addr, size, direction); } /** * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_single(dev->dma_device, addr, size, direction); } /** * ib_dma_map_page - Map a physical page to DMA address * @dev: The device for which the dma_addr is to be created * @page: The page to be mapped * @offset: The offset within the page * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline u64 ib_dma_map_page(struct ib_device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction direction) { if (ib_uses_virt_dma(dev)) return (uintptr_t)(page_address(page) + offset); return dma_map_page(dev->dma_device, page, offset, size, direction); } /** * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page() * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @direction: The direction of the DMA */ static inline void ib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction direction) { if (!ib_uses_virt_dma(dev)) dma_unmap_page(dev->dma_device, addr, size, direction); } int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (ib_uses_virt_dma(dev)) return ib_dma_virt_map_sg(dev, sg, nents); return dma_map_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } /** * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The sg_table object describing the buffer * @direction: The direction of the DMA * @attrs: Optional DMA attributes for the map operation */ static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { int nents; if (ib_uses_virt_dma(dev)) { nents = ib_dma_virt_map_sg(dev, sgt->sgl, sgt->orig_nents); if (!nents) return -EIO; sgt->nents = nents; return 0; } return dma_map_sgtable(dev->dma_device, sgt, direction, dma_attrs); } static inline void ib_dma_unmap_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, enum dma_data_direction direction, unsigned long dma_attrs) { if (!ib_uses_virt_dma(dev)) dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs); } /** * ib_dma_map_sg - Map a scatter/gather list to DMA addresses * @dev: The device for which the DMA addresses are to be created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline int ib_dma_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { return ib_dma_map_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses * @dev: The device for which the DMA addresses were created * @sg: The array of scatter/gather entries * @nents: The number of scatter/gather entries * @direction: The direction of the DMA */ static inline void ib_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction) { ib_dma_unmap_sg_attrs(dev, sg, nents, direction, 0); } /** * ib_dma_max_seg_size - Return the size limit of a single DMA transfer * @dev: The device to query * * The returned value represents a size in bytes. */ static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev) { if (ib_uses_virt_dma(dev)) return UINT_MAX; return dma_get_max_seg_size(dev->dma_device); } /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_cpu(dev->dma_device, addr, size, dir); } /** * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device * @dev: The device for which the DMA address was created * @addr: The DMA address * @size: The size of the region in bytes * @dir: The direction of the DMA */ static inline void ib_dma_sync_single_for_device(struct ib_device *dev, u64 addr, size_t size, enum dma_data_direction dir) { if (!ib_uses_virt_dma(dev)) dma_sync_single_for_device(dev->dma_device, addr, size, dir); } /* ib_reg_user_mr - register a memory region for virtual addresses from kernel * space. This function should be called when 'current' is the owning MM. */ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags); /* ib_advise_mr - give an advice about an address range in a memory region */ int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge); /** * ib_dereg_mr_user - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * @udata: Valid user data or NULL for kernel object * * This function can fail, if the memory region has memory windows bound to it. */ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata); /** * ib_dereg_mr - Deregisters a kernel memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. * * This function can fail, if the memory region has memory windows bound to it. * * NOTE: for user mr use ib_dereg_mr_user with valid udata! */ static inline int ib_dereg_mr(struct ib_mr *mr) { return ib_dereg_mr_user(mr, NULL); } struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. * @mr - struct ib_mr pointer to be updated. * @newkey - new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { mr->lkey = (mr->lkey & 0xffffff00) | newkey; mr->rkey = (mr->rkey & 0xffffff00) | newkey; } /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. * @rkey - the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { const u32 mask = 0x000000ff; return ((rkey + 1) & mask) | (rkey & ~mask); } /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type * IB_QPT_UD. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. * * In order to send and receive multicast packets, subnet * administration must have created the multicast group and configured * the fabric appropriately. The port associated with the specified * QP must also be a member of the multicast group. */ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_detach_mcast - Detaches the specified QP from a multicast group. * @qp: QP to detach from the multicast group. * @gid: Multicast group GID. * @lid: Multicast group LID in host byte order. */ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, struct inode *inode, struct ib_udata *udata); int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata); static inline int ib_check_mr_access(struct ib_device *ib_dev, unsigned int flags) { u64 device_cap = ib_dev->attrs.device_cap_flags; /* * Local write permission is required if remote write or * remote atomic permission is also requested. */ if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && !(flags & IB_ACCESS_LOCAL_WRITE)) return -EINVAL; if (flags & ~IB_ACCESS_SUPPORTED) return -EINVAL; if (flags & IB_ACCESS_ON_DEMAND && !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) return -EOPNOTSUPP; if ((flags & IB_ACCESS_FLUSH_GLOBAL && !(device_cap & IB_DEVICE_FLUSH_GLOBAL)) || (flags & IB_ACCESS_FLUSH_PERSISTENT && !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))) return -EOPNOTSUPP; return 0; } static inline bool ib_access_writable(int access_flags) { /* * We have writable memory backing the MR if any of the following * access flags are set. "Local write" and "remote write" obviously * require write access. "Remote atomic" can do things like fetch and * add, which will modify memory, and "MW bind" can change permissions * by binding a window. */ return access_flags & (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND); } /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected * ib_mr. first use is for signature status check. * * @mr: A memory region. * @check_mask: Bitmask of which checks to perform from * ib_mr_status_check enumeration. * @mr_status: The container of relevant status checks. * failed checks will be indicated in the status bitmask * and the relevant info shall be in the error item. */ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); /** * ib_device_try_get: Hold a registration lock * device: The device to lock * * A device under an active registration lock cannot become unregistered. It * is only possible to obtain a registration lock on a device that is fully * registered, otherwise this function returns false. * * The registration lock is only necessary for actions which require the * device to still be registered. Uses that only require the device pointer to * be valid should use get_device(&ibdev->dev) to hold the memory. * */ static inline bool ib_device_try_get(struct ib_device *dev) { return refcount_inc_not_zero(&dev->refcount); } void ib_device_put(struct ib_device *device); struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, enum rdma_driver_id driver_id); struct ib_device *ib_device_get_by_name(const char *name, enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size); int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset, unsigned int page_size); static inline int ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { int n; n = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size); mr->iova = 0; return n; } int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64)); void ib_drain_rq(struct ib_qp *qp); void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_ROCE) return attr->roce.dmac; return NULL; } static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.dlid = (u16)dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.dlid = dlid; } static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.dlid; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.dlid; return 0; } static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl) { attr->sl = sl; } static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr) { return attr->sl; } static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr, u8 src_path_bits) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) attr->ib.src_path_bits = src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.src_path_bits = src_path_bits; } static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_IB) return attr->ib.src_path_bits; else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.src_path_bits; return 0; } static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, bool make_grd) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) attr->opa.make_grd = make_grd; } static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) { if (attr->type == RDMA_AH_ATTR_TYPE_OPA) return attr->opa.make_grd; return false; } static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num) { attr->port_num = port_num; } static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr) { return attr->port_num; } static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr, u8 static_rate) { attr->static_rate = static_rate; } static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr) { return attr->static_rate; } static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr, enum ib_ah_flags flag) { attr->ah_flags = flag; } static inline enum ib_ah_flags rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr) { return attr->ah_flags; } static inline const struct ib_global_route *rdma_ah_read_grh(const struct rdma_ah_attr *attr) { return &attr->grh; } /*To retrieve and modify the grh */ static inline struct ib_global_route *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr) { return &attr->grh; } static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid)); } static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr, __be64 prefix) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.subnet_prefix = prefix; } static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr, __be64 if_id) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); grh->dgid.global.interface_id = if_id; } static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 sgid_index, u8 hop_limit, u8 traffic_class) { struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); attr->ah_flags = IB_AH_GRH; if (dgid) grh->dgid = *dgid; grh->flow_label = flow_label; grh->sgid_index = sgid_index; grh->hop_limit = hop_limit; grh->traffic_class = traffic_class; grh->sgid_attr = NULL; } void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr); void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 hop_limit, u8 traffic_class, const struct ib_gid_attr *sgid_attr); void rdma_copy_ah_attr(struct rdma_ah_attr *dest, const struct rdma_ah_attr *src); void rdma_replace_ah_attr(struct rdma_ah_attr *old, const struct rdma_ah_attr *new); void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src); /** * rdma_ah_find_type - Return address handle type. * * @dev: Device to be checked * @port_num: Port number */ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, u32 port_num) { if (rdma_protocol_roce(dev, port_num)) return RDMA_AH_ATTR_TYPE_ROCE; if (rdma_protocol_ib(dev, port_num)) { if (rdma_cap_opa_ah(dev, port_num)) return RDMA_AH_ATTR_TYPE_OPA; return RDMA_AH_ATTR_TYPE_IB; } return RDMA_AH_ATTR_TYPE_UNDEFINED; } /** * ib_lid_cpu16 - Return lid in 16bit CPU encoding. * In the current implementation the only way to * get the 32bit lid is from other sources for OPA. * For IB, lids will always be 16bits so cast the * value accordingly. * * @lid: A 32bit LID */ static inline u16 ib_lid_cpu16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return (u16)lid; } /** * ib_lid_be16 - Return lid in 16bit BE encoding. * * @lid: A 32bit LID */ static inline __be16 ib_lid_be16(u32 lid) { WARN_ON_ONCE(lid & 0xFFFF0000); return cpu_to_be16((u16)lid); } /** * ib_get_vector_affinity - Get the affinity mappings of a given completion * vector * @device: the rdma device * @comp_vector: index of completion vector * * Returns NULL on failure, otherwise a corresponding cpu map of the * completion vector (returns all-cpus map if the device driver doesn't * implement get_vector_affinity). */ static inline const struct cpumask * ib_get_vector_affinity(struct ib_device *device, int comp_vector) { if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || !device->ops.get_vector_affinity) return NULL; return device->ops.get_vector_affinity(device, comp_vector); } /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. * * @device: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)); int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), struct net_device *netdev); /** * rdma_device_to_ibdev - Get ib_device pointer from device pointer * * @device: device pointer for which ib_device pointer to retrieve * * rdma_device_to_ibdev() retrieves ib_device pointer from device. * */ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) { struct ib_core_device *coredev = container_of(device, struct ib_core_device, dev); return coredev->owner; } /** * ibdev_to_node - return the NUMA node for a given ib_device * @dev: device to get the NUMA node for. */ static inline int ibdev_to_node(struct ib_device *ibdev) { struct device *parent = ibdev->dev.parent; if (!parent) return NUMA_NO_NODE; return dev_to_node(parent); } /** * rdma_device_to_drv_device - Helper macro to reach back to driver's * ib_device holder structure from device pointer. * * NOTE: New drivers should not make use of this API; This API is only for * existing drivers who have exposed sysfs entries using * ops->device_group. */ #define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \ container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); #define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) #define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF) #define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) /** * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based * on the flow_label * * This function will convert the 20 bit flow_label input to a valid RoCE v2 * UDP src port 14 bit value. All RoCE V2 drivers should use this same * convention. */ static inline u16 rdma_flow_label_to_udp_sport(u32 fl) { u32 fl_low = fl & 0x03fff, fl_high = fl & 0xFC000; fl_low ^= fl_high >> 14; return (u16)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN); } /** * rdma_calc_flow_label - generate a RDMA symmetric flow label value based on * local and remote qpn values * * This function folded the multiplication results of two qpns, 24 bit each, * fields, and converts it to a 20 bit results. * * This function will create symmetric flow_label value based on the local * and remote qpn values. this will allow both the requester and responder * to calculate the same flow_label for a given connection. * * This helper function should be used by driver in case the upper layer * provide a zero flow_label value. This is to improve entropy of RDMA * traffic in the network. */ static inline u32 rdma_calc_flow_label(u32 lqpn, u32 rqpn) { u64 v = (u64)lqpn * rqpn; v ^= v >> 20; v ^= v >> 40; return (u32)(v & IB_GRH_FLOWLABEL_MASK); } /** * rdma_get_udp_sport - Calculate and set UDP source port based on the flow * label. If flow label is not defined in GRH then * calculate it based on lqpn/rqpn. * * @fl: flow label from GRH * @lqpn: local qp number * @rqpn: remote qp number */ static inline u16 rdma_get_udp_sport(u32 fl, u32 lqpn, u32 rqpn) { if (!fl) fl = rdma_calc_flow_label(lqpn, rqpn); return rdma_flow_label_to_udp_sport(fl); } const struct ib_port_immutable* ib_port_immutable_read(struct ib_device *dev, unsigned int port); #endif /* IB_VERBS_H */
68 65 3 65 53 53 53 45 45 287 259 78 78 9 78 541 541 541 541 541 84 77 2 2 2 77 79 5 5 79 77 2 75 2 77 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines, Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * These functions handle all input from the IP layer into SCTP. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/time.h> /* For struct timeval */ #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/snmp.h> #include <net/sock.h> #include <net/xfrm.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> #include <net/sock_reuseport.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *daddr, int dif, int sdif); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif); static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); /* Calculate the SCTP checksum of an SCTP packet. */ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) { struct sctphdr *sh = sctp_hdr(skb); __le32 cmp = sh->checksum; __le32 val = sctp_compute_cksum(skb, 0); if (val != cmp) { /* CRC failure, dump it. */ __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS); return -1; } return 0; } /* * This is the routine which IP calls when receiving an SCTP packet. */ int sctp_rcv(struct sk_buff *skb) { struct sock *sk; struct sctp_association *asoc; struct sctp_endpoint *ep = NULL; struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); int dif, sdif; if (skb->pkt_type != PACKET_HOST) goto discard_it; __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); /* If packet is too small to contain a single chunk, let's not * waste time on it anymore. */ if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + skb_transport_offset(skb)) goto discard_it; /* If the packet is fragmented and we need to do crc checking, * it's better to just linearize it otherwise crc computing * takes longer. */ if ((!is_gso && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; /* Pull up the IP header. */ __skb_pull(skb, skb_transport_offset(skb)); skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); else if (!sctp_checksum_disable && !is_gso && sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); af->from_skb(&dest, skb, 0); dif = af->skb_iif(skb); sdif = af->skb_sdif(skb); /* If the packet is to or from a non-unicast address, * silently discard the packet. * * This is not clearly defined in the RFC except in section * 8.4 - OOTB handling. However, based on the book "Stream Control * Transmission Protocol" 2.1, "It is important to note that the * IP address of an SCTP transport address must be a routable * unicast address. In other words, IP multicast addresses and * IP broadcast addresses cannot be used in an SCTP transport * address." */ if (!af->addr_valid(&src, NULL, skb) || !af->addr_valid(&dest, NULL, skb)) goto discard_it; asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif); if (!asoc) ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; sk = rcvr->sk; /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) * packet if it is correctly formed, i.e., passed the * receiver's checksum check, but the receiver is not * able to identify the association to which this * packet belongs. */ if (!asoc) { if (sctp_rcv_ootb(skb)) { __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); goto discard_release; } } if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; /* Create an SCTP packet structure. */ chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC); if (!chunk) goto discard_release; SCTP_INPUT_CB(skb)->chunk = chunk; /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; /* Remember the SCTP header. */ chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); /* Remember where we came from. */ chunk->transport = transport; /* Acquire access to the sock lock. Note: We are safe from other * bottom halves on this lock, but a user may be in the lock too, * so check if it is busy. */ bh_lock_sock(sk); if (sk != rcvr->sk) { /* Our cached sk is different from the rcvr->sk. This is * because migrate()/accept() may have moved the association * to a new socket and released all the sockets. So now we * are holding a lock on the old socket while the user may * be doing something with the new socket. Switch our veiw * of the current sk. */ bh_unlock_sock(sk); sk = rcvr->sk; bh_lock_sock(sk); } if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; } __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG); } else { __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ); sctp_inq_push(&chunk->rcvr->inqueue, chunk); } bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); return 0; discard_it: __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); kfree_skb(skb); return 0; discard_release: /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); goto discard_it; } /* Process the backlog queue of the socket. Every skb on * the backlog holds a ref on an association or endpoint. * We hold this ref throughout the state machine to make * sure that the structure we need is still around. */ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_inq *inqueue = &chunk->rcvr->inqueue; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = NULL; int backloged = 0; rcvr = chunk->rcvr; /* If the rcvr is dead then the association or endpoint * has been deleted and we can safely drop the chunk * and refs that we are holding. */ if (rcvr->dead) { sctp_chunk_free(chunk); goto done; } if (unlikely(rcvr->sk != sk)) { /* In this case, the association moved from one socket to * another. We are currently sitting on the backlog of the * old socket, so we need to move. * However, since we are here in the process context we * need to take make sure that the user doesn't own * the new socket when we process the packet. * If the new socket is user-owned, queue the chunk to the * backlog of the new socket without dropping any refs. * Otherwise, we can safely push the chunk on the inqueue. */ sk = rcvr->sk; local_bh_disable(); bh_lock_sock(sk); if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; } else sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); local_bh_enable(); /* If the chunk was backloged again, don't drop refs */ if (backloged) return 0; } else { if (!sctp_newsk_ready(sk)) { if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) return 0; sctp_chunk_free(chunk); } else { sctp_inq_push(inqueue, chunk); } } done: /* Release the refs we took in sctp_add_backlog */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_put(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_put(sctp_ep(rcvr)); else BUG(); return 0; } static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = chunk->rcvr; int ret; ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear * from us */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_hold(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_hold(sctp_ep(rcvr)); else BUG(); } return ret; } /* Handle icmp frag needed error. */ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { if (!t || (t->pathmtu <= pmtu && t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)) return; if (sock_owned_by_user(sk)) { atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; } if (!(t->param_flags & SPP_PMTUD_ENABLE)) /* We can't allow retransmitting in such case, as the * retransmission would be sized just as before, and thus we * would get another icmp, and retransmit again. */ return; /* Update transports view of the MTU. Return if no update was needed. * If an update wasn't needed/possible, it also doesn't make sense to * try to retransmit now. */ if (!sctp_transport_update_pmtu(t, pmtu)) return; /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); /* Retransmit with the new pmtu setting. */ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, struct sk_buff *skb) { struct dst_entry *dst; if (sock_owned_by_user(sk) || !t) return; dst = sctp_transport_dst_check(t); if (dst) dst->ops->redirect(dst, sk, skb); } /* * SCTP Implementer's Guide, 2.37 ICMP handling procedures * * ICMP8) If the ICMP code is a "Unrecognized next header type encountered" * or a "Protocol Unreachable" treat this message as an abort * with the T bit set. * * This function sends an event to the state machine, which will abort the * association. * */ void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t) { if (sock_owned_by_user(sk)) { if (timer_pending(&t->proto_unreach_timer)) return; else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); pr_debug("%s: unrecognized next header type " "encountered!\n", __func__); if (del_timer(&t->proto_unreach_timer)) sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, t, GFP_ATOMIC); } } /* Common lookup code for icmp/icmpv6 error handler. */ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctphdr *sctphdr, struct sctp_association **app, struct sctp_transport **tpp) { struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; __u32 vtag = ntohl(sctphdr->vtag); int sdif = inet_sdif(skb); int dif = inet_iif(skb); *app = NULL; *tpp = NULL; af = sctp_get_af_specific(family); if (unlikely(!af)) { return NULL; } /* Initialize local addresses for lookups. */ af->from_skb(&saddr, skb, 1); af->from_skb(&daddr, skb, 0); /* Look for an association that matches the incoming ICMP error * packet. */ asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif); if (!asoc) return NULL; sk = asoc->base.sk; /* RFC 4960, Appendix C. ICMP Handling * * ICMP6) An implementation MUST validate that the Verification Tag * contained in the ICMP message matches the Verification Tag of * the peer. If the Verification Tag is not 0 and does NOT * match, discard the ICMP message. If it is 0 and the ICMP * message contains enough bytes to verify that the chunk type is * an INIT chunk and that the Initiate Tag matches the tag of the * peer, continue with ICMP7. If the ICMP message is too short * or the chunk type or the Initiate Tag does not match, silently * discard the packet. */ if (vtag == 0) { /* chunk header + first 4 octects of init header */ chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct sctphdr), sizeof(struct sctp_chunkhdr) + sizeof(__be32), &_chunkhdr); if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; } else if (vtag != asoc->c.peer_vtag) { goto out; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); *app = asoc; *tpp = transport; return sk; out: sctp_transport_put(transport); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. */ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) __releases(&((__sk)->sk_lock.slock)) { bh_unlock_sock(sk); sctp_transport_put(t); } static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) return; if (code == ICMP_FRAG_NEEDED) { sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info)); return; } if (code == ICMP_PROT_UNREACH) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } err = icmp_err_convert[code].errno; break; case ICMP_TIME_EXCEEDED: if (code == ICMP_EXC_FRAGTIME) return; err = EHOSTUNREACH; break; case ICMP_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: return; } if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the sctp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, iph->ihl * 4); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original values. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } sctp_v4_err_handle(transport, skb, type, code, info); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmphdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr)); if (hdr->type == ICMP_REDIRECT) { /* can't be handled without outer iphdr known, leave it to udp_err */ sctp_err_finish(sk, t); return 0; } if (hdr->type == ICMP_DEST_UNREACH && hdr->code == ICMP_FRAG_NEEDED) info = ntohs(hdr->un.frag.mtu); sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info); sctp_err_finish(sk, t); return 1; } /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * * This function scans all the chunks in the OOTB packet to determine if * the packet should be discarded right away. If a response might be needed * for this packet, or, if further processing is possible, the packet will * be queued to a proper inqueue for the next phase of handling. * * Output: * Return 0 - If further processing is needed. * Return 1 - If the packet can be discarded right away. */ static int sctp_rcv_ootb(struct sk_buff *skb) { struct sctp_chunkhdr *ch, _ch; int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { /* Make sure we have at least the header there */ if (offset + sizeof(_ch) > skb->len) break; ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the * receiver MUST silently discard the OOTB packet and take no * further action. */ if (SCTP_CID_ABORT == ch->type) goto discard; /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE * chunk, the receiver should silently discard the packet * and take no further action. */ if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type) goto discard; /* RFC 4460, 2.11.2 * This will discard packets with INIT chunk bundled as * subsequent chunks in the packet. When INIT is first, * the normal INIT processing will discard the chunk. */ if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; offset = ch_end; } while (ch_end < skb->len); return 0; discard: return 1; } /* Insert endpoint into the hash table. */ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); struct sctp_hashbucket *head; ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; if (sk->sk_reuseport) { bool any = sctp_is_ep_boundall(sk); struct sctp_endpoint *ep2; struct list_head *list; int cnt = 0, err = 1; list_for_each(list, &ep->base.bind_addr.address_list) cnt++; sctp_for_each_hentry(ep2, &head->chain) { struct sock *sk2 = ep2->base.sk; if (!net_eq(sock_net(sk2), net) || sk2 == sk || !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || !sk2->sk_reuseport) continue; err = sctp_bind_addrs_check(sctp_sk(sk2), sctp_sk(sk), cnt); if (!err) { err = reuseport_add_sock(sk, sk2, any); if (err) return err; break; } else if (err < 0) { return err; } } if (err) { err = reuseport_alloc(sk, any); if (err) return err; } } write_lock(&head->lock); hlist_add_head(&ep->node, &head->chain); write_unlock(&head->lock); return 0; } /* Add an endpoint to the hash. Local BH-safe. */ int sctp_hash_endpoint(struct sctp_endpoint *ep) { int err; local_bh_disable(); err = __sctp_hash_endpoint(ep); local_bh_enable(); return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; ep->hashent = sctp_ep_hashfn(sock_net(sk), ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); write_lock(&head->lock); hlist_del_init(&ep->node); write_unlock(&head->lock); } /* Remove endpoint from the hash. Local BH-safe. */ void sctp_unhash_endpoint(struct sctp_endpoint *ep) { local_bh_disable(); __sctp_unhash_endpoint(ep); local_bh_enable(); } static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, const union sctp_addr *paddr, __u32 seed) { __u32 addr; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else addr = (__force __u32)paddr->v4.sin_addr.s_addr; return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | (__force __u32)lport, net_hash_mix(net), seed); } /* Look up an endpoint. */ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; __be16 lport; int hash; lport = laddr->v4.sin_port; hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(ep, &head->chain) { if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif)) goto hit; } ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: sk = ep->base.sk; if (sk->sk_reuseport) { __u32 phash = sctp_hashfn(net, lport, paddr, 0); sk = reuseport_select_sock(sk, phash, skb, sizeof(struct sctphdr)); if (sk) ep = sctp_sk(sk)->ep; } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; } /* rhashtable for transport */ struct sctp_hash_cmp_arg { const union sctp_addr *paddr; const struct net *net; __be16 lport; }; static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return err; if (!sctp_transport_hold(t)) return err; if (!net_eq(t->asoc->base.net, x->net)) goto out; if (x->lport != htons(t->asoc->base.bind_addr.port)) goto out; err = 0; out: sctp_transport_put(t); return err; } static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; return sctp_hashfn(t->asoc->base.net, htons(t->asoc->base.bind_addr.port), &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { .head_offset = offsetof(struct sctp_transport, node), .hashfn = sctp_hash_key, .obj_hashfn = sctp_hash_obj, .obj_cmpfn = sctp_hash_cmp, .automatic_shrinking = true, }; int sctp_transport_hashtable_init(void) { return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params); } void sctp_transport_hashtable_destroy(void) { rhltable_destroy(&sctp_transport_hashtable); } int sctp_hash_transport(struct sctp_transport *t) { struct sctp_transport *transport; struct rhlist_head *tmp, *list; struct sctp_hash_cmp_arg arg; int err; if (t->asoc->temp) return 0; arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); rcu_read_lock(); list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { rcu_read_unlock(); return -EEXIST; } rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); if (err) pr_err_once("insert transport fail, errno %d\n", err); return err; } void sctp_unhash_transport(struct sctp_transport *t) { if (t->asoc->temp) return; rhltable_remove(&sctp_transport_hashtable, &t->node, sctp_hash_params); } bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { bool l3mdev_accept = true; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept); #endif return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif); } /* return a transport with holding it */ struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct rhlist_head *tmp, *list; struct sctp_transport *t; int bound_dev_if; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = net, .lport = laddr->v4.sin_port, }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) { if (!sctp_transport_hold(t)) continue; bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if); if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && sctp_bind_addr_match(&t->asoc->base.bind_addr, laddr, sctp_sk(t->asoc->base.sk))) return t; sctp_transport_put(t); } return NULL; } /* return a transport without holding it, as it's only used under sock lock */ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) if (ep == t->asoc->ep) return t; return NULL; } /* Look up an association. */ static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif) { struct sctp_transport *t; struct sctp_association *asoc = NULL; t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif); if (!t) goto out; asoc = t->asoc; *pt = t; out: return asoc; } /* Look up an association. protected by RCU read lock */ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); rcu_read_unlock(); return asoc; } /* Is there an association matching the given local and peer addresses? */ bool sctp_has_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_transport *transport; if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) { sctp_transport_put(transport); return true; } return false; } /* * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. * * D) When searching for a matching TCB upon reception of an INIT * or INIT-ACK chunk the receiver SHOULD use not only the * source address of the packet (containing the INIT or * INIT-ACK) but the receiver SHOULD also use all valid * address parameters contained within the chunk. * * 2.18.3 Solution description * * This new text clearly specifies to an implementor the need * to look within the INIT or INIT-ACK. Any implementation that * does not do this, may not be able to establish associations * in certain circumstances. * */ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; union sctp_addr addr; union sctp_addr *paddr = &addr; struct sctphdr *sh = sctp_hdr(skb); union sctp_params params; struct sctp_init_chunk *init; struct sctp_af *af; /* * This code will NOT touch anything inside the chunk--it is * strictly READ-ONLY. * * RFC 2960 3 SCTP packet Format * * Multiple chunks can be bundled into one SCTP packet up to * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN * COMPLETE chunks. These chunks MUST NOT be bundled with any * other chunk in a packet. See Section 6.10 for more details * on chunk bundling. */ /* Find the start of the TLVs and the end of the chunk. This is * the region we search for address parameters. */ init = (struct sctp_init_chunk *)skb->data; /* Walk the parameters looking for embedded addresses. */ sctp_walk_params(params, init) { /* Note: Ignoring hostname addresses. */ af = sctp_get_af_specific(param_type2af(params.p->type)); if (!af) continue; if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) return asoc; } return NULL; } /* ADD-IP, Section 5.2 * When an endpoint receives an ASCONF Chunk from the remote peer * special procedures may be needed to identify the association the * ASCONF Chunk is associated with. To properly find the association * the following procedures SHOULD be followed: * * D2) If the association is not found, use the address found in the * Address Parameter TLV combined with the port number found in the * SCTP common header. If found proceed to rule D4. * * D2-ext) If more than one ASCONF Chunks are packed together, use the * address found in the ASCONF Address Parameter TLV of each of the * subsequent ASCONF Chunks. If found, proceed to rule D4. */ static struct sctp_association *__sctp_rcv_asconf_lookup( struct net *net, struct sctp_chunkhdr *ch, const union sctp_addr *laddr, __be16 peer_port, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; union sctp_addr_param *param; union sctp_addr paddr; if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) return NULL; /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); af = sctp_get_af_specific(param_type2af(param->p.type)); if (unlikely(!af)) return NULL; if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif); } /* SCTP-AUTH, Section 6.3: * If the receiver does not find a STCB for a packet containing an AUTH * chunk as the first chunk and not a COOKIE-ECHO chunk as the second * chunk, it MUST use the chunks after the AUTH chunk to look up an existing * association. * * This means that any chunks that can help us identify the association need * to be looked at to find this association. */ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc = NULL; struct sctp_chunkhdr *ch; int have_auth = 0; unsigned int chunk_num = 1; __u8 *ch_end; /* Walk through the chunks looking for AUTH or ASCONF chunks * to help us find the association. */ ch = (struct sctp_chunkhdr *)skb->data; do { /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(*ch)) break; ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; switch (ch->type) { case SCTP_CID_AUTH: have_auth = chunk_num; break; case SCTP_CID_COOKIE_ECHO: /* If a packet arrives containing an AUTH chunk as * a first chunk, a COOKIE-ECHO chunk as the second * chunk, and possibly more chunks after them, and * the receiver does not have an STCB for that * packet, then authentication is based on * the contents of the COOKIE- ECHO chunk. */ if (have_auth == 1 && chunk_num == 2) return NULL; break; case SCTP_CID_ASCONF: if (have_auth || net->sctp.addip_noauth) asoc = __sctp_rcv_asconf_lookup( net, ch, laddr, sctp_hdr(skb)->source, transportp, dif, sdif); break; default: break; } if (asoc) break; ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } /* * There are circumstances when we need to look inside the SCTP packet * for information to help us find the association. Examples * include looking inside of INIT/INIT-ACK chunks or after the AUTH * chunks. */ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_chunkhdr *ch; /* We do not allow GSO frames here as we need to linearize and * then cannot guarantee frame boundaries. This shouldn't be an * issue as packets hitting this are mostly INIT or INIT-ACK and * those cannot be on GSO-style anyway. */ if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return NULL; ch = (struct sctp_chunkhdr *)skb->data; /* The code below will attempt to walk the chunk and extract * parameter information. Before we do that, we need to verify * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif); return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif); } /* Lookup an association for an inbound skb. */ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) goto out; /* Further lookup for INIT/INIT-ACK packets. * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. */ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif); if (asoc) goto out; if (paddr->sa.sa_family == AF_INET) pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n", &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port), &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port)); else pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n", &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port), &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port)); out: return asoc; }
182 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2019, Tessares SA. */ #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/netns/generic.h> #include "protocol.h" #define MPTCP_SYSCTL_PATH "net/mptcp" static int mptcp_pernet_id; #ifdef CONFIG_SYSCTL static int mptcp_pm_type_max = __MPTCP_PM_TYPE_MAX; #endif struct mptcp_pernet { #ifdef CONFIG_SYSCTL struct ctl_table_header *ctl_table_hdr; #endif unsigned int add_addr_timeout; unsigned int stale_loss_cnt; u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; char scheduler[MPTCP_SCHED_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) { return net_generic(net, mptcp_pernet_id); } int mptcp_is_enabled(const struct net *net) { return mptcp_get_pernet(net)->mptcp_enabled; } unsigned int mptcp_get_add_addr_timeout(const struct net *net) { return mptcp_get_pernet(net)->add_addr_timeout; } int mptcp_is_checksum_enabled(const struct net *net) { return mptcp_get_pernet(net)->checksum_enabled; } int mptcp_allow_join_id0(const struct net *net) { return mptcp_get_pernet(net)->allow_join_initial_addr_port; } unsigned int mptcp_stale_loss_cnt(const struct net *net) { return mptcp_get_pernet(net)->stale_loss_cnt; } int mptcp_get_pm_type(const struct net *net) { return mptcp_get_pernet(net)->pm_type; } const char *mptcp_get_scheduler(const struct net *net) { return mptcp_get_pernet(net)->scheduler; } static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; pernet->checksum_enabled = 0; pernet->allow_join_initial_addr_port = 1; pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; strcpy(pernet->scheduler, "default"); } #ifdef CONFIG_SYSCTL static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", .maxlen = sizeof(u8), .mode = 0644, /* users with CAP_NET_ADMIN or root (not and) can change this * value, same as other sysctl or the 'net' tree. */ .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "add_addr_timeout", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "checksum_enabled", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "allow_join_initial_addr_port", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, { .procname = "stale_loss_cnt", .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_douintvec_minmax, }, { .procname = "pm_type", .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, { .procname = "scheduler", .maxlen = MPTCP_SCHED_NAME_MAX, .mode = 0644, .proc_handler = proc_dostring, }, {} }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { struct ctl_table_header *hdr; struct ctl_table *table; table = mptcp_sysctl_table; if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(mptcp_sysctl_table), GFP_KERNEL); if (!table) goto err_alloc; } table[0].data = &pernet->mptcp_enabled; table[1].data = &pernet->add_addr_timeout; table[2].data = &pernet->checksum_enabled; table[3].data = &pernet->allow_join_initial_addr_port; table[4].data = &pernet->stale_loss_cnt; table[5].data = &pernet->pm_type; table[6].data = &pernet->scheduler; hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); if (!hdr) goto err_reg; pernet->ctl_table_hdr = hdr; return 0; err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) { struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg; unregister_net_sysctl_table(pernet->ctl_table_hdr); kfree(table); } #else static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) { return 0; } static void mptcp_pernet_del_table(struct mptcp_pernet *pernet) {} #endif /* CONFIG_SYSCTL */ static int __net_init mptcp_net_init(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); mptcp_pernet_set_defaults(pernet); return mptcp_pernet_new_table(net, pernet); } /* Note: the callback will only be called per extra netns */ static void __net_exit mptcp_net_exit(struct net *net) { struct mptcp_pernet *pernet = mptcp_get_pernet(net); mptcp_pernet_del_table(pernet); } static struct pernet_operations mptcp_pernet_ops = { .init = mptcp_net_init, .exit = mptcp_net_exit, .id = &mptcp_pernet_id, .size = sizeof(struct mptcp_pernet), }; void __init mptcp_init(void) { mptcp_join_cookie_init(); mptcp_proto_init(); if (register_pernet_subsys(&mptcp_pernet_ops) < 0) panic("Failed to register MPTCP pernet subsystem.\n"); } #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcpv6_init(void) { int err; err = mptcp_proto_v6_init(); return err; } #endif
1202 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 // SPDX-License-Identifier: GPL-2.0 /* * Dynamic byte queue limits. See include/linux/dynamic_queue_limits.h * * Copyright (c) 2011, Tom Herbert <therbert@google.com> */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/dynamic_queue_limits.h> #include <linux/compiler.h> #include <linux/export.h> #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0) /* Records completed count and recalculates the queue limit */ void dql_completed(struct dql *dql, unsigned int count) { unsigned int inprogress, prev_inprogress, limit; unsigned int ovlimit, completed, num_queued; bool all_prev_completed; num_queued = READ_ONCE(dql->num_queued); /* Can't complete more than what's in queue */ BUG_ON(count > num_queued - dql->num_completed); completed = dql->num_completed + count; limit = dql->limit; ovlimit = POSDIFF(num_queued - dql->num_completed, limit); inprogress = num_queued - completed; prev_inprogress = dql->prev_num_queued - dql->num_completed; all_prev_completed = AFTER_EQ(completed, dql->prev_num_queued); if ((ovlimit && !inprogress) || (dql->prev_ovlimit && all_prev_completed)) { /* * Queue considered starved if: * - The queue was over-limit in the last interval, * and there is no more data in the queue. * OR * - The queue was over-limit in the previous interval and * when enqueuing it was possible that all queued data * had been consumed. This covers the case when queue * may have becomes starved between completion processing * running and next time enqueue was scheduled. * * When queue is starved increase the limit by the amount * of bytes both sent and completed in the last interval, * plus any previous over-limit. */ limit += POSDIFF(completed, dql->prev_num_queued) + dql->prev_ovlimit; dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } else if (inprogress && prev_inprogress && !all_prev_completed) { /* * Queue was not starved, check if the limit can be decreased. * A decrease is only considered if the queue has been busy in * the whole interval (the check above). * * If there is slack, the amount of excess data queued above * the amount needed to prevent starvation, the queue limit * can be decreased. To avoid hysteresis we consider the * minimum amount of slack found over several iterations of the * completion routine. */ unsigned int slack, slack_last_objs; /* * Slack is the maximum of * - The queue limit plus previous over-limit minus twice * the number of objects completed. Note that two times * number of completed bytes is a basis for an upper bound * of the limit. * - Portion of objects in the last queuing operation that * was not part of non-zero previous over-limit. That is * "round down" by non-overlimit portion of the last * queueing operation. */ slack = POSDIFF(limit + dql->prev_ovlimit, 2 * (completed - dql->num_completed)); slack_last_objs = dql->prev_ovlimit ? POSDIFF(dql->prev_last_obj_cnt, dql->prev_ovlimit) : 0; slack = max(slack, slack_last_objs); if (slack < dql->lowest_slack) dql->lowest_slack = slack; if (time_after(jiffies, dql->slack_start_time + dql->slack_hold_time)) { limit = POSDIFF(limit, dql->lowest_slack); dql->slack_start_time = jiffies; dql->lowest_slack = UINT_MAX; } } /* Enforce bounds on limit */ limit = clamp(limit, dql->min_limit, dql->max_limit); if (limit != dql->limit) { dql->limit = limit; ovlimit = 0; } dql->adj_limit = limit + completed; dql->prev_ovlimit = ovlimit; dql->prev_last_obj_cnt = dql->last_obj_cnt; dql->num_completed = completed; dql->prev_num_queued = num_queued; } EXPORT_SYMBOL(dql_completed); void dql_reset(struct dql *dql) { /* Reset all dynamic values */ dql->limit = 0; dql->num_queued = 0; dql->num_completed = 0; dql->last_obj_cnt = 0; dql->prev_num_queued = 0; dql->prev_last_obj_cnt = 0; dql->prev_ovlimit = 0; dql->lowest_slack = UINT_MAX; dql->slack_start_time = jiffies; } EXPORT_SYMBOL(dql_reset); void dql_init(struct dql *dql, unsigned int hold_time) { dql->max_limit = DQL_MAX_LIMIT; dql->min_limit = 0; dql->slack_hold_time = hold_time; dql_reset(dql); } EXPORT_SYMBOL(dql_init);
334 1 4 4 4 4 11 4 3 4 4 4 4 4 4 4 4 11 109 111 109 106 11 104 94 16 15 13 1 12 14 14 12 111 2 12 1 1 1 11 10 1 10 5 4 3 3 8 7 6 6 6 47 45 45 45 42 3 2 1 5 4 4 3 2 5 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 // SPDX-License-Identifier: GPL-2.0-or-later /* * common UDP/RAW code * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/route.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/tcp_states.h> #include <net/dsfield.h> #include <net/sock_reuseport.h> #include <linux/errqueue.h> #include <linux/uaccess.h> static bool ipv6_mapped_addr_any(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } static void ip6_datagram_flow_key_init(struct flowi6 *fl6, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6->flowi6_uid = sk->sk_uid; if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; if (!oif) { if (ipv6_addr_is_multicast(&fl6->daddr)) oif = np->mcast_oif; else oif = np->ucast_oif; } fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; struct in6_addr *final_p, final; struct ipv6_txoptions *opt; struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 fl6; int err = 0; if (inet6_test_bit(SNDFLOW, sk) && (np->flow_label & IPV6_FLOWLABEL_MASK)) { flowlabel = fl6_sock_lookup(sk, np->flow_label); if (IS_ERR(flowlabel)) return -EINVAL; } ip6_datagram_flow_key_init(&fl6, sk); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); final_p = fl6_update_dst(&fl6, opt, &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (fix_sk_saddr) { if (ipv6_addr_any(&np->saddr)) np->saddr = fl6.saddr; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { sk->sk_v6_rcv_saddr = fl6.saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } } ip6_sk_dst_store_flow(sk, dst, &fl6); out: fl6_sock_release(flowlabel); return err; } void ip6_datagram_release_cb(struct sock *sk) { struct dst_entry *dst; if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) return; rcu_read_lock(); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; } rcu_read_unlock(); ip6_datagram_dst_update(sk, false); } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *daddr, old_daddr; __be32 fl6_flowlabel = 0; __be32 old_fl6_flowlabel; __be16 old_dport; int addr_type; int err; if (usin->sin6_family == AF_INET) { if (ipv6_only_sock(sk)) return -EAFNOSUPPORT; err = __ip4_datagram_connect(sk, uaddr, addr_len); goto ipv4_connected; } if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; if (inet6_test_bit(SNDFLOW, sk)) fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; if (ipv6_addr_any(&usin->sin6_addr)) { /* * connect to self */ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); daddr = &usin->sin6_addr; if (addr_type & IPV6_ADDR_MAPPED) { struct sockaddr_in sin; if (ipv6_only_sock(sk)) { err = -ENETUNREACH; goto out; } sin.sin_family = AF_INET; sin.sin_addr.s_addr = daddr->s6_addr32[3]; sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, (struct sockaddr *) &sin, sizeof(sin)); ipv4_connected: if (err) goto out; ipv6_addr_set_v4mapped(inet->inet_daddr, &sk->sk_v6_daddr); if (ipv6_addr_any(&np->saddr) || ipv6_mapped_addr_any(&np->saddr)) ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr); if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_mapped_addr_any(&sk->sk_v6_rcv_saddr)) { ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr); if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } goto out; } if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) { err = -EINVAL; goto out; } WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id); } if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST)) WRITE_ONCE(sk->sk_bound_dev_if, np->mcast_oif); /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) { err = -EINVAL; goto out; } } /* save the current peer information before updating it */ old_daddr = sk->sk_v6_daddr; old_fl6_flowlabel = np->flow_label; old_dport = inet->inet_dport; sk->sk_v6_daddr = *daddr; np->flow_label = fl6_flowlabel; inet->inet_dport = usin->sin6_port; /* * Check for a route to destination an obtain the * destination cache for it. */ err = ip6_datagram_dst_update(sk, true); if (err) { /* Restore the socket peer info, to keep it consistent with * the old socket state */ sk->sk_v6_daddr = old_daddr; np->flow_label = old_fl6_flowlabel; inet->inet_dport = old_dport; goto out; } reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: return err; } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip6_datagram_connect(sk, uaddr, addr_len); release_sock(sk); return res; } EXPORT_SYMBOL_GPL(ip6_datagram_connect); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); if (sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; return ip6_datagram_connect(sk, uaddr, addr_len); } EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only); static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out) { switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_TIME_EXCEED: case ICMPV6_DEST_UNREACH: ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr), icmp6_hdr(skb)->icmp6_datagram_len * 8); } } void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload) { struct icmp6hdr *icmph = icmp6_hdr(skb); struct sock_exterr_skb *serr; if (!inet6_test_bit(RECVERR6, sk)) return; skb = skb_clone(skb, GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; serr->ee.ee_type = icmph->icmp6_type; serr->ee.ee_code = icmph->icmp6_code; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&(((struct ipv6hdr *)(icmph + 1))->daddr) - skb_network_header(skb); serr->port = port; __skb_pull(skb, payload - skb->data); if (inet6_test_bit(RECVERR6_RFC4884, sk)) ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { struct sock_exterr_skb *serr; struct ipv6hdr *iph; struct sk_buff *skb; if (!inet6_test_bit(RECVERR6, sk)) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb->protocol = htons(ETH_P_IPV6); skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; ip6_flow_hdr(iph, 0, 0); serr = SKB_EXT_ERR(skb); serr->ee.ee_errno = err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_type = 0; serr->ee.ee_code = 0; serr->ee.ee_pad = 0; serr->ee.ee_info = info; serr->ee.ee_data = 0; serr->addr_offset = (u8 *)&iph->daddr - skb_network_header(skb); serr->port = fl6->fl6_dport; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6hdr *iph; struct sk_buff *skb; struct ip6_mtuinfo *mtu_info; if (!np->rxopt.bits.rxpmtu) return; skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb) return; skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); iph = ipv6_hdr(skb); iph->daddr = fl6->daddr; mtu_info = IP6CBMTU(skb); mtu_info->ip6m_mtu = mtu; mtu_info->ip6m_addr.sin6_family = AF_INET6; mtu_info->ip6m_addr.sin6_port = 0; mtu_info->ip6m_addr.sin6_flowinfo = 0; mtu_info->ip6m_addr.sin6_scope_id = fl6->flowi6_oif; mtu_info->ip6m_addr.sin6_addr = ipv6_hdr(skb)->daddr; __skb_pull(skb, skb_tail_pointer(skb) - skb->data); skb_reset_transport_header(skb); skb = xchg(&np->rxpmtu, skb); kfree_skb(skb); } /* For some errors we have valid addr_offset even with zero payload and * zero port. Also, addr_offset should be supported if port is set. */ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) { return serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL || serr->port; } /* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. * * At one point, excluding local errors was a quick test to identify icmp/icmp6 * errors. This is no longer true, but the test remained, so the v6 stack, * unlike v4, also honors cmsg requests on all wifi and timestamp errors. */ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, struct sock_exterr_skb *serr) { if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) return true; if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) return false; if (!IP6CB(skb)->iif) return false; return true; } /* * Handle MSG_ERRQUEUE */ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sock_exterr_skb *serr; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); struct { struct sock_extended_err ee; struct sockaddr_in6 offender; } errhdr; int err; int copied; err = -EAGAIN; skb = sock_dequeue_err_skb(sk); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (unlikely(err)) { kfree_skb(skb); return err; } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); if (sin && ipv6_datagram_support_addr(serr)) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = serr->port; if (skb->protocol == htons(ETH_P_IPV6)) { const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset), struct ipv6hdr, daddr); sin->sin6_addr = ip6h->daddr; if (inet6_test_bit(SNDFLOW, sk)) sin->sin6_flowinfo = ip6_flowinfo(ip6h); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(*(__be32 *)(nh + serr->addr_offset), &sin->sin6_addr); sin->sin6_scope_id = 0; } *addr_len = sizeof(*sin); } memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) ip6_datagram_recv_specific_ctl(sk, msg, skb); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, IP6CB(skb)->iif); } else { ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &sin->sin6_addr); if (inet_cmsg_flags(inet_sk(sk))) ip_cmsg_recv(msg, skb); } } put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); /* Now we could try to dump offended packet options */ msg->msg_flags |= MSG_ERRQUEUE; err = copied; consume_skb(skb); out: return err; } EXPORT_SYMBOL_GPL(ipv6_recv_error); /* * Handle IPV6_RECVPATHMTU */ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct ip6_mtuinfo mtu_info; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin, msg->msg_name); int err; int copied; err = -EAGAIN; skb = xchg(&np->rxpmtu, NULL); if (!skb) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; sock_recv_timestamp(msg, sk, skb); memcpy(&mtu_info, IP6CBMTU(skb), sizeof(mtu_info)); if (sin) { sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_port = 0; sin->sin6_scope_id = mtu_info.ip6m_addr.sin6_scope_id; sin->sin6_addr = mtu_info.ip6m_addr.sin6_addr; *addr_len = sizeof(*sin); } put_cmsg(msg, SOL_IPV6, IPV6_PATHMTU, sizeof(mtu_info), &mtu_info); err = copied; out_free_skb: kfree_skb(skb); out: return err; } void ip6_datagram_recv_common_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); bool is_ipv6 = skb->protocol == htons(ETH_P_IPV6); if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; if (is_ipv6) { src_info.ipi6_ifindex = IP6CB(skb)->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; } else { src_info.ipi6_ifindex = PKTINFO_SKB_CB(skb)->ipi_ifindex; ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, &src_info.ipi6_addr); } if (src_info.ipi6_ifindex >= 0) put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } } void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct inet6_skb_parm *opt = IP6CB(skb); unsigned char *nh = skb_network_header(skb); if (np->rxopt.bits.rxhlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = ipv6_get_dsfield(ipv6_hdr(skb)); put_cmsg(msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = ip6_flowinfo((struct ipv6hdr *)nh); if (flowinfo) put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } /* HbH is allowed only once */ if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } if (opt->lastopt && (np->rxopt.bits.dstopts || np->rxopt.bits.srcrt)) { /* * Silly enough, but we need to reparse in order to * report extension headers (except for HbH) * in order. * * Also note that IPV6_RECVRTHDRDSTOPTS is NOT * (and WILL NOT be) defined because * IPV6_RECVDSTOPTS is more generic. --yoshfuji */ unsigned int off = sizeof(struct ipv6hdr); u8 nexthdr = ipv6_hdr(skb)->nexthdr; while (off <= opt->lastopt) { unsigned int len; u8 *ptr = nh + off; switch (nexthdr) { case IPPROTO_DSTOPTS: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.dstopts) put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, len, ptr); break; case IPPROTO_ROUTING: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; if (np->rxopt.bits.srcrt) put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, len, ptr); break; case IPPROTO_AH: nexthdr = ptr[0]; len = (ptr[1] + 2) << 2; break; default: nexthdr = ptr[0]; len = (ptr[1] + 1) << 3; break; } off += len; } } /* socket options in old style */ if (np->rxopt.bits.rxoinfo) { struct in6_pktinfo src_info; src_info.ipi6_ifindex = opt->iif; src_info.ipi6_addr = ipv6_hdr(skb)->daddr; put_cmsg(msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { u8 *ptr = nh + opt->dst0; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.osrcrt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(nh + opt->srcrt); put_cmsg(msg, SOL_IPV6, IPV6_2292RTHDR, (rthdr->hdrlen+1) << 3, rthdr); } if (np->rxopt.bits.odstopts && opt->dst1) { u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.rxorigdstaddr) { struct sockaddr_in6 sin6; __be16 _ports[2], *ports; ports = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_ports), &_ports); if (ports) { /* All current transport protocols have the port numbers in the * first four bytes of the transport header and this function is * written with this assumption in mind. */ sin6.sin6_family = AF_INET6; sin6.sin6_addr = ipv6_hdr(skb)->daddr; sin6.sin6_port = ports[1]; sin6.sin6_flowinfo = 0; sin6.sin6_scope_id = ipv6_iface_scope_id(&ipv6_hdr(skb)->daddr, opt->iif); put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); } } if (np->rxopt.bits.recvfragsize && opt->frag_max_size) { int val = opt->frag_max_size; put_cmsg(msg, SOL_IPV6, IPV6_RECVFRAGSIZE, sizeof(val), &val); } } void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { ip6_datagram_recv_common_ctl(sk, msg, skb); ip6_datagram_recv_specific_ctl(sk, msg, skb); } EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, struct ipcm6_cookie *ipc6) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; struct ipv6_txoptions *opt = ipc6->opt; int len; int err = 0; for_each_cmsghdr(cmsg, msg) { int addr_type; if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; goto exit_f; } if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) return err; continue; } if (cmsg->cmsg_level != SOL_IPV6) continue; switch (cmsg->cmsg_type) { case IPV6_PKTINFO: case IPV6_2292PKTINFO: { struct net_device *dev = NULL; int src_idx; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; } src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); src_idx = src_info->ipi6_ifindex; if (src_idx) { if (fl6->flowi6_oif && src_idx != fl6->flowi6_oif && (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif || !sk_dev_equal_l3scope(sk, src_idx))) return -EINVAL; fl6->flowi6_oif = src_idx; } addr_type = __ipv6_addr_type(&src_info->ipi6_addr); rcu_read_lock(); if (fl6->flowi6_oif) { dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); if (!dev) { rcu_read_unlock(); return -ENODEV; } } else if (addr_type & IPV6_ADDR_LINKLOCAL) { rcu_read_unlock(); return -EINVAL; } if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) && !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr, dev, !strict, 0, IFA_F_TENTATIVE) && !ipv6_chk_acast_addr_src(net, dev, &src_info->ipi6_addr)) err = -EINVAL; else fl6->saddr = src_info->ipi6_addr; } rcu_read_unlock(); if (err) goto exit_f; break; } case IPV6_FLOWINFO: if (cmsg->cmsg_len < CMSG_LEN(4)) { err = -EINVAL; goto exit_f; } if (fl6->flowlabel&IPV6_FLOWINFO_MASK) { if ((fl6->flowlabel^*(__be32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { err = -EINVAL; goto exit_f; } } fl6->flowlabel = IPV6_FLOWINFO_MASK & *(__be32 *)CMSG_DATA(cmsg); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } opt->opt_nflen += len; opt->hopopt = hdr; break; case IPV6_2292DSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (opt->dst1opt) { err = -EINVAL; goto exit_f; } opt->opt_flen += len; opt->dst1opt = hdr; break; case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { err = -EINVAL; goto exit_f; } hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); len = ((hdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } if (!ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; goto exit_f; } if (cmsg->cmsg_type == IPV6_DSTOPTS) { opt->opt_flen += len; opt->dst1opt = hdr; } else { opt->opt_nflen += len; opt->dst0opt = hdr; } break; case IPV6_2292RTHDR: case IPV6_RTHDR: if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { err = -EINVAL; goto exit_f; } rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) { err = -EINVAL; goto exit_f; } break; #endif default: err = -EINVAL; goto exit_f; } len = ((rthdr->hdrlen + 1) << 3); if (cmsg->cmsg_len < CMSG_LEN(len)) { err = -EINVAL; goto exit_f; } /* segments left must also match */ if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { err = -EINVAL; goto exit_f; } opt->opt_nflen += len; opt->srcrt = rthdr; if (cmsg->cmsg_type == IPV6_2292RTHDR && opt->dst1opt) { int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); opt->opt_nflen += dsthdrlen; opt->dst0opt = opt->dst1opt; opt->dst1opt = NULL; opt->opt_flen -= dsthdrlen; } break; case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { err = -EINVAL; goto exit_f; } ipc6->hlimit = *(int *)CMSG_DATA(cmsg); if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) { err = -EINVAL; goto exit_f; } break; case IPV6_TCLASS: { int tc; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; tc = *(int *)CMSG_DATA(cmsg); if (tc < -1 || tc > 0xff) goto exit_f; err = 0; ipc6->tclass = tc; break; } case IPV6_DONTFRAG: { int df; err = -EINVAL; if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) goto exit_f; df = *(int *)CMSG_DATA(cmsg); if (df < 0 || df > 1) goto exit_f; err = 0; ipc6->dontfrag = df; break; } default: net_dbg_ratelimited("invalid cmsg type: %d\n", cmsg->cmsg_type); err = -EINVAL; goto exit_f; } } exit_f: return err; } EXPORT_SYMBOL_GPL(ip6_datagram_send_ctl); void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, __u16 destp, int rqueue, int bucket) { const struct in6_addr *dest, *src; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; seq_printf(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n", bucket, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, sp->sk_state, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H #ifdef __KERNEL__ #include <asm/nops.h> #include <asm/processor-flags.h> #include <linux/irqflags.h> #include <linux/jump_label.h> /* * The compiler should not reorder volatile asm statements with respect to each * other: they should execute in program order. However GCC 4.9.x and 5.x have * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder * volatile asm. The write functions are not affected since they have memory * clobbers preventing reordering. To prevent reads from being reordered with * respect to writes, use a dummy memory operand. */ #define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline void native_write_cr2(unsigned long val) { asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } static inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } static inline unsigned long native_read_cr4(void) { unsigned long val; #ifdef CONFIG_X86_32 /* * This could fault if CR4 does not exist. Non-existent CR4 * is functionally equivalent to CR4 == 0. Keep it simple and pretend * that CR4 == 0 on CPUs that don't have CR4. */ asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) : "=r" (val) : "0" (0), __FORCE_ORDER); #else /* CR4 always exists on x86_64. */ asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); #endif return val; } void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; /* * "rdpkru" instruction. Places PKRU contents in to EAX, * clears EDX and requires that ecx=0. */ asm volatile(".byte 0x0f,0x01,0xee\n\t" : "=a" (pkru), "=d" (edx) : "c" (ecx)); return pkru; } static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; /* * "wrpkru" instruction. Loads contents in EAX to PKRU, * requires that ecx = edx = 0. */ asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (pkru), "c"(ecx), "d"(edx)); } #else static inline u32 rdpkru(void) { return 0; } static inline void wrpkru(u32 pkru) { } #endif static __always_inline void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } static inline unsigned long __read_cr4(void) { return native_read_cr4(); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else static inline unsigned long read_cr0(void) { return native_read_cr0(); } static inline void write_cr0(unsigned long x) { native_write_cr0(x); } static __always_inline unsigned long read_cr2(void) { return native_read_cr2(); } static __always_inline void write_cr2(unsigned long x) { native_write_cr2(x); } /* * Careful! CR3 contains more than just an address. You probably want * read_cr3_pa() instead. */ static inline unsigned long __read_cr3(void) { return __native_read_cr3(); } static inline void write_cr3(unsigned long x) { native_write_cr3(x); } static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } static __always_inline void wbinvd(void) { native_wbinvd(); } #endif /* CONFIG_PARAVIRT_XXL */ static __always_inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } static inline void clflushopt(volatile void *__p) { alternative_io(".byte 0x3e; clflush %P0", ".byte 0x66; clflush %P0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } static inline void clwb(volatile void *__p) { volatile struct { char x[64]; } *p = __p; asm volatile(ALTERNATIVE_2( ".byte 0x3e; clflush (%[pax])", ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ X86_FEATURE_CLFLUSHOPT, ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ X86_FEATURE_CLWB) : [p] "+m" (*p) : [pax] "a" (p)); } #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { asm_volatile_goto("1: wrussq %[val], (%[addr])\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "r" (addr), [val] "r" (val) :: fail); return 0; fail: return -EFAULT; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #define nop() asm volatile ("nop") static inline void serialize(void) { /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); } /* The dst parameter must be 64-bytes aligned */ static inline void movdir64b(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; /* * MOVDIR64B %(rdx), rax. * * Both __src and __dst must be memory constraints in order to tell the * compiler that no other memory accesses should be reordered around * this one. * * Also, both must be supplied as lvalues because this tells * the compiler what the object is (its size) the instruction accesses. * I.e., not the pointers but what they point to, thus the deref'ing '*'. */ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" : "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); } /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: 512 bits memory operand * * The ENQCMDS instruction allows software to write a 512-bit command to * a 512-bit-aligned special MMIO region that supports the instruction. * A return status is loaded into the ZF flag in the RFLAGS register. * ZF = 0 equates to success, and ZF = 1 indicates retry or error. * * This function issues the ENQCMDS instruction to submit data from * kernel space to MMIO space, in a unit of 512 bits. Order of data access * is not guaranteed, nor is a memory barrier performed afterwards. It * returns 0 on success and -EAGAIN on failure. * * Warning: Do not use this helper unless your driver has checked that the * ENQCMDS instruction is supported on the platform and the device accepts * ENQCMDS. */ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; bool zf; /* * ENQCMDS %(rdx), rax * * See movdir64b()'s comment on operand specification. */ asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" CC_SET(z) : CC_OUT(z) (zf), "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); /* Submission failure is indicated via EFLAGS.ZF=1 */ if (zf) return -EAGAIN; return 0; } static __always_inline void tile_release(void) { /* * Instruction opcode for TILERELEASE; supported in binutils * version >= 2.36. */ asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); } #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */
7 4 5 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file define the new driver API for Wireless Extensions * * Version : 8 16.3.07 * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 2001-2007 Jean Tourrilhes, All Rights Reserved. */ #ifndef _IW_HANDLER_H #define _IW_HANDLER_H /************************** DOCUMENTATION **************************/ /* * Initial driver API (1996 -> onward) : * ----------------------------------- * The initial API just sends the IOCTL request received from user space * to the driver (via the driver ioctl handler). The driver has to * handle all the rest... * * The initial API also defines a specific handler in struct net_device * to handle wireless statistics. * * The initial APIs served us well and has proven a reasonably good design. * However, there is a few shortcommings : * o No events, everything is a request to the driver. * o Large ioctl function in driver with gigantic switch statement * (i.e. spaghetti code). * o Driver has to mess up with copy_to/from_user, and in many cases * does it unproperly. Common mistakes are : * * buffer overflows (no checks or off by one checks) * * call copy_to/from_user with irq disabled * o The user space interface is tied to ioctl because of the use * copy_to/from_user. * * New driver API (2002 -> onward) : * ------------------------------- * The new driver API is just a bunch of standard functions (handlers), * each handling a specific Wireless Extension. The driver just export * the list of handler it supports, and those will be called apropriately. * * I tried to keep the main advantage of the previous API (simplicity, * efficiency and light weight), and also I provide a good dose of backward * compatibility (most structures are the same, driver can use both API * simultaneously, ...). * Hopefully, I've also addressed the shortcomming of the initial API. * * The advantage of the new API are : * o Handling of Extensions in driver broken in small contained functions * o Tighter checks of ioctl before calling the driver * o Flexible commit strategy (at least, the start of it) * o Backward compatibility (can be mixed with old API) * o Driver doesn't have to worry about memory and user-space issues * The last point is important for the following reasons : * o You are now able to call the new driver API from any API you * want (including from within other parts of the kernel). * o Common mistakes are avoided (buffer overflow, user space copy * with irq disabled and so on). * * The Drawback of the new API are : * o bloat (especially kernel) * o need to migrate existing drivers to new API * My initial testing shows that the new API adds around 3kB to the kernel * and save between 0 and 5kB from a typical driver. * Also, as all structures and data types are unchanged, the migration is * quite straightforward (but tedious). * * --- * * The new driver API is defined below in this file. User space should * not be aware of what's happening down there... * * A new kernel wrapper is in charge of validating the IOCTLs and calling * the appropriate driver handler. This is implemented in : * # net/core/wireless.c * * The driver export the list of handlers in : * # include/linux/netdevice.h (one place) * * The new driver API is available for WIRELESS_EXT >= 13. * Good luck with migration to the new API ;-) */ /* ---------------------- THE IMPLEMENTATION ---------------------- */ /* * Some of the choice I've made are pretty controversials. Defining an * API is very much weighting compromises. This goes into some of the * details and the thinking behind the implementation. * * Implementation goals : * -------------------- * The implementation goals were as follow : * o Obvious : you should not need a PhD to understand what's happening, * the benefit is easier maintenance. * o Flexible : it should accommodate a wide variety of driver * implementations and be as flexible as the old API. * o Lean : it should be efficient memory wise to minimise the impact * on kernel footprint. * o Transparent to user space : the large number of user space * applications that use Wireless Extensions should not need * any modifications. * * Array of functions versus Struct of functions * --------------------------------------------- * 1) Having an array of functions allow the kernel code to access the * handler in a single lookup, which is much more efficient (think hash * table here). * 2) The only drawback is that driver writer may put their handler in * the wrong slot. This is trivial to test (I set the frequency, the * bitrate changes). Once the handler is in the proper slot, it will be * there forever, because the array is only extended at the end. * 3) Backward/forward compatibility : adding new handler just require * extending the array, so you can put newer driver in older kernel * without having to patch the kernel code (and vice versa). * * All handler are of the same generic type * ---------------------------------------- * That's a feature !!! * 1) Having a generic handler allow to have generic code, which is more * efficient. If each of the handler was individually typed I would need * to add a big switch in the kernel (== more bloat). This solution is * more scalable, adding new Wireless Extensions doesn't add new code. * 2) You can use the same handler in different slots of the array. For * hardware, it may be more efficient or logical to handle multiple * Wireless Extensions with a single function, and the API allow you to * do that. (An example would be a single record on the card to control * both bitrate and frequency, the handler would read the old record, * modify it according to info->cmd and rewrite it). * * Functions prototype uses union iwreq_data * ----------------------------------------- * Some would have preferred functions defined this way : * static int mydriver_ioctl_setrate(struct net_device *dev, * long rate, int auto) * 1) The kernel code doesn't "validate" the content of iwreq_data, and * can't do it (different hardware may have different notion of what a * valid frequency is), so we don't pretend that we do it. * 2) The above form is not extendable. If I want to add a flag (for * example to distinguish setting max rate and basic rate), I would * break the prototype. Using iwreq_data is more flexible. * 3) Also, the above form is not generic (see above). * 4) I don't expect driver developper using the wrong field of the * union (Doh !), so static typechecking doesn't add much value. * 5) Lastly, you can skip the union by doing : * static int mydriver_ioctl_setrate(struct net_device *dev, * struct iw_request_info *info, * struct iw_param *rrq, * char *extra) * And then adding the handler in the array like this : * (iw_handler) mydriver_ioctl_setrate, // SIOCSIWRATE * * Using functions and not a registry * ---------------------------------- * Another implementation option would have been for every instance to * define a registry (a struct containing all the Wireless Extensions) * and only have a function to commit the registry to the hardware. * 1) This approach can be emulated by the current code, but not * vice versa. * 2) Some drivers don't keep any configuration in the driver, for them * adding such a registry would be a significant bloat. * 3) The code to translate from Wireless Extension to native format is * needed anyway, so it would not reduce significantely the amount of code. * 4) The current approach only selectively translate Wireless Extensions * to native format and only selectively set, whereas the registry approach * would require to translate all WE and set all parameters for any single * change. * 5) For many Wireless Extensions, the GET operation return the current * dynamic value, not the value that was set. * * This header is <net/iw_handler.h> * --------------------------------- * 1) This header is kernel space only and should not be exported to * user space. Headers in "include/linux/" are exported, headers in * "include/net/" are not. * * Mixed 32/64 bit issues * ---------------------- * The Wireless Extensions are designed to be 64 bit clean, by using only * datatypes with explicit storage size. * There are some issues related to kernel and user space using different * memory model, and in particular 64bit kernel with 32bit user space. * The problem is related to struct iw_point, that contains a pointer * that *may* need to be translated. * This is quite messy. The new API doesn't solve this problem (it can't), * but is a step in the right direction : * 1) Meta data about each ioctl is easily available, so we know what type * of translation is needed. * 2) The move of data between kernel and user space is only done in a single * place in the kernel, so adding specific hooks in there is possible. * 3) In the long term, it allows to move away from using ioctl as the * user space API. * * So many comments and so few code * -------------------------------- * That's a feature. Comments won't bloat the resulting kernel binary. */ /***************************** INCLUDES *****************************/ #include <linux/wireless.h> /* IOCTL user space API */ #include <linux/if_ether.h> /***************************** VERSION *****************************/ /* * This constant is used to know which version of the driver API is * available. Hopefully, this will be pretty stable and no changes * will be needed... * I just plan to increment with each new version. */ #define IW_HANDLER_VERSION 8 /* * Changes : * * V2 to V3 * -------- * - Move event definition in <linux/wireless.h> * - Add Wireless Event support : * o wireless_send_event() prototype * o iwe_stream_add_event/point() inline functions * V3 to V4 * -------- * - Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes * * V4 to V5 * -------- * - Add new spy support : struct iw_spy_data & prototypes * * V5 to V6 * -------- * - Change the way we get to spy_data method for added safety * - Remove spy #ifdef, they are always on -> cleaner code * - Add IW_DESCR_FLAG_NOMAX flag for very large requests * - Start migrating get_wireless_stats to struct iw_handler_def * * V6 to V7 * -------- * - Add struct ieee80211_device pointer in struct iw_public_data * - Remove (struct iw_point *)->pointer from events and streams * - Remove spy_offset from struct iw_handler_def * - Add "check" version of event macros for ieee802.11 stack * * V7 to V8 * ---------- * - Prevent leaking of kernel space in stream on 64 bits. */ /**************************** CONSTANTS ****************************/ /* Enhanced spy support available */ #define IW_WIRELESS_SPY #define IW_WIRELESS_THRSPY /* Special error message for the driver to indicate that we * should do a commit after return from the iw_handler */ #define EIWCOMMIT EINPROGRESS /* Flags available in struct iw_request_info */ #define IW_REQUEST_FLAG_COMPAT 0x0001 /* Compat ioctl call */ /* Type of headers we know about (basically union iwreq_data) */ #define IW_HEADER_TYPE_NULL 0 /* Not available */ #define IW_HEADER_TYPE_CHAR 2 /* char [IFNAMSIZ] */ #define IW_HEADER_TYPE_UINT 4 /* __u32 */ #define IW_HEADER_TYPE_FREQ 5 /* struct iw_freq */ #define IW_HEADER_TYPE_ADDR 6 /* struct sockaddr */ #define IW_HEADER_TYPE_POINT 8 /* struct iw_point */ #define IW_HEADER_TYPE_PARAM 9 /* struct iw_param */ #define IW_HEADER_TYPE_QUAL 10 /* struct iw_quality */ /* Handling flags */ /* Most are not implemented. I just use them as a reminder of some * cool features we might need one day ;-) */ #define IW_DESCR_FLAG_NONE 0x0000 /* Obvious */ /* Wrapper level flags */ #define IW_DESCR_FLAG_DUMP 0x0001 /* Not part of the dump command */ #define IW_DESCR_FLAG_EVENT 0x0002 /* Generate an event on SET */ #define IW_DESCR_FLAG_RESTRICT 0x0004 /* GET : request is ROOT only */ /* SET : Omit payload from generated iwevent */ #define IW_DESCR_FLAG_NOMAX 0x0008 /* GET : no limit on request size */ /* Driver level flags */ #define IW_DESCR_FLAG_WAIT 0x0100 /* Wait for driver event */ /****************************** TYPES ******************************/ /* ----------------------- WIRELESS HANDLER ----------------------- */ /* * A wireless handler is just a standard function, that looks like the * ioctl handler. * We also define there how a handler list look like... As the Wireless * Extension space is quite dense, we use a simple array, which is faster * (that's the perfect hash table ;-). */ /* * Meta data about the request passed to the iw_handler. * Most handlers can safely ignore what's in there. * The 'cmd' field might come handy if you want to use the same handler * for multiple command... * This struct is also my long term insurance. I can add new fields here * without breaking the prototype of iw_handler... */ struct iw_request_info { __u16 cmd; /* Wireless Extension command */ __u16 flags; /* More to come ;-) */ }; struct net_device; /* * This is how a function handling a Wireless Extension should look * like (both get and set, standard and private). */ typedef int (*iw_handler)(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* * This define all the handler that the driver export. * As you need only one per driver type, please use a static const * shared by all driver instances... Same for the members... * This will be linked from net_device in <linux/netdevice.h> */ struct iw_handler_def { /* Array of handlers for standard ioctls * We will call dev->wireless_handlers->standard[ioctl - SIOCIWFIRST] */ const iw_handler * standard; /* Number of handlers defined (more precisely, index of the * last defined handler + 1) */ __u16 num_standard; #ifdef CONFIG_WEXT_PRIV __u16 num_private; /* Number of private arg description */ __u16 num_private_args; /* Array of handlers for private ioctls * Will call dev->wireless_handlers->private[ioctl - SIOCIWFIRSTPRIV] */ const iw_handler * private; /* Arguments of private handler. This one is just a list, so you * can put it in any order you want and should not leave holes... * We will automatically export that to user space... */ const struct iw_priv_args * private_args; #endif /* New location of get_wireless_stats, to de-bloat struct net_device. * The old pointer in struct net_device will be gradually phased * out, and drivers are encouraged to use this one... */ struct iw_statistics* (*get_wireless_stats)(struct net_device *dev); }; /* ---------------------- IOCTL DESCRIPTION ---------------------- */ /* * One of the main goal of the new interface is to deal entirely with * user space/kernel space memory move. * For that, we need to know : * o if iwreq is a pointer or contain the full data * o what is the size of the data to copy * * For private IOCTLs, we use the same rules as used by iwpriv and * defined in struct iw_priv_args. * * For standard IOCTLs, things are quite different and we need to * use the structures below. Actually, this struct is also more * efficient, but that's another story... */ /* * Describe how a standard IOCTL looks like. */ struct iw_ioctl_description { __u8 header_type; /* NULL, iw_point or other */ __u8 token_type; /* Future */ __u16 token_size; /* Granularity of payload */ __u16 min_tokens; /* Min acceptable token number */ __u16 max_tokens; /* Max acceptable token number */ __u32 flags; /* Special handling of the request */ }; /* Need to think of short header translation table. Later. */ /* --------------------- ENHANCED SPY SUPPORT --------------------- */ /* * In the old days, the driver was handling spy support all by itself. * Now, the driver can delegate this task to Wireless Extensions. * It needs to include this struct in its private part and use the * standard spy iw_handler. */ /* * Instance specific spy data, i.e. addresses spied and quality for them. */ struct iw_spy_data { /* --- Standard spy support --- */ int spy_number; u_char spy_address[IW_MAX_SPY][ETH_ALEN]; struct iw_quality spy_stat[IW_MAX_SPY]; /* --- Enhanced spy support (event) */ struct iw_quality spy_thr_low; /* Low threshold */ struct iw_quality spy_thr_high; /* High threshold */ u_char spy_thr_under[IW_MAX_SPY]; }; /* --------------------- DEVICE WIRELESS DATA --------------------- */ /* * This is all the wireless data specific to a device instance that * is managed by the core of Wireless Extensions or the 802.11 layer. * We only keep pointer to those structures, so that a driver is free * to share them between instances. * This structure should be initialised before registering the device. * Access to this data follow the same rules as any other struct net_device * data (i.e. valid as long as struct net_device exist, same locking rules). */ /* Forward declaration */ struct libipw_device; /* The struct */ struct iw_public_data { /* Driver enhanced spy support */ struct iw_spy_data * spy_data; /* Legacy structure managed by the ipw2x00-specific IEEE 802.11 layer */ struct libipw_device * libipw; }; /**************************** PROTOTYPES ****************************/ /* * Functions part of the Wireless Extensions (defined in net/wireless/wext-core.c). * Those may be called by driver modules. */ /* Send a single event to user space */ void wireless_send_event(struct net_device *dev, unsigned int cmd, union iwreq_data *wrqu, const char *extra); #ifdef CONFIG_WEXT_CORE /* flush all previous wext events - if work is done from netdev notifiers */ void wireless_nlevent_flush(void); #else static inline void wireless_nlevent_flush(void) {} #endif /* We may need a function to send a stream of events to user space. * More on that later... */ /* Standard handler for SIOCSIWSPY */ int iw_handler_set_spy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Standard handler for SIOCGIWSPY */ int iw_handler_get_spy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Standard handler for SIOCSIWTHRSPY */ int iw_handler_set_thrspy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Standard handler for SIOCGIWTHRSPY */ int iw_handler_get_thrspy(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); /* Driver call to update spy records */ void wireless_spy_update(struct net_device *dev, unsigned char *address, struct iw_quality *wstats); /************************* INLINE FUNTIONS *************************/ /* * Function that are so simple that it's more efficient inlining them */ static inline int iwe_stream_lcp_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_LCP_LEN; #endif return IW_EV_LCP_LEN; } static inline int iwe_stream_point_len(struct iw_request_info *info) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) return IW_EV_COMPAT_POINT_LEN; #endif return IW_EV_POINT_LEN; } static inline int iwe_stream_event_len_adjust(struct iw_request_info *info, int event_len) { #ifdef CONFIG_COMPAT if (info->flags & IW_REQUEST_FLAG_COMPAT) { event_len -= IW_EV_LCP_LEN; event_len += IW_EV_COMPAT_LCP_LEN; } #endif return event_len; } /*------------------------------------------------------------------*/ /* * Wrapper to add an Wireless Event to a stream of events. */ char *iwe_stream_add_event(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len); static inline char * iwe_stream_add_event_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len) { char *res = iwe_stream_add_event(info, stream, ends, iwe, event_len); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add an short Wireless Event containing a pointer to a * stream of events. */ char *iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra); static inline char * iwe_stream_add_point_check(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra) { char *res = iwe_stream_add_point(info, stream, ends, iwe, extra); if (res == stream) return ERR_PTR(-E2BIG); return res; } /*------------------------------------------------------------------*/ /* * Wrapper to add a value to a Wireless Event in a stream of events. * Be careful, this one is tricky to use properly : * At the first run, you need to have (value = event + IW_EV_LCP_LEN). */ char *iwe_stream_add_value(struct iw_request_info *info, char *event, char *value, char *ends, struct iw_event *iwe, int event_len); #endif /* _IW_HANDLER_H */
106 106 106 106 106 106 106 140 140 116 13 13 13 13 103 103 103 98 98 98 110 13 13 140 2 1 1 1 1 1 140 140 140 140 140 140 140 140 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <crypto/sha2.h> #include <net/tcp.h> #include <net/mptcp.h> #include "protocol.h" #include "mib.h" #include <trace/events/mptcp.h> static bool mptcp_cap_flag_sha256(u8 flags) { return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; } static void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct mptcp_options_received *mp_opt) { u8 subtype = *ptr >> 4; int expected_opsize; u16 subopt; u8 version; u8 flags; u8 i; switch (subtype) { case MPTCPOPT_MP_CAPABLE: /* strict size checking */ if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { if (skb->len > tcp_hdr(skb)->doff << 2) expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; subopt = OPTION_MPTCP_MPC_ACK; } else { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; subopt = OPTION_MPTCP_MPC_SYNACK; } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; subopt = OPTION_MPTCP_MPC_SYN; } } /* Cfr RFC 8684 Section 3.3.0: * If a checksum is present but its use had * not been negotiated in the MP_CAPABLE handshake, the receiver MUST * close the subflow with a RST, as it is not behaving as negotiated. * If a checksum is not present when its use has been negotiated, the * receiver MUST close the subflow with a RST, as it is considered * broken * We parse even option with mismatching csum presence, so that * later in subflow_data_ready we can trigger the reset. */ if (opsize != expected_opsize && (expected_opsize != TCPOLEN_MPTCP_MPC_ACK_DATA || opsize != TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM)) break; /* try to be gentle vs future versions on the initial syn */ version = *ptr++ & MPTCP_VERSION_MASK; if (opsize != TCPOLEN_MPTCP_MPC_SYN) { if (version != MPTCP_SUPPORTED_VERSION) break; } else if (version < MPTCP_SUPPORTED_VERSION) { break; } flags = *ptr++; if (!mptcp_cap_flag_sha256(flags) || (flags & MPTCP_CAP_EXTENSIBILITY)) break; /* RFC 6824, Section 3.1: * "For the Checksum Required bit (labeled "A"), if either * host requires the use of checksums, checksums MUST be used. * In other words, the only way for checksums not to be used * is if both hosts in their SYNs set A=0." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; } if (opsize >= TCPOLEN_MPTCP_MPC_ACK_DATA) { /* Section 3.1.: * "the data parameters in a MP_CAPABLE are semantically * equivalent to those in a DSS option and can be used * interchangeably." */ mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { mp_opt->csum = get_unaligned((__force __sum16 *)ptr); mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", version, flags, opsize, mp_opt->sndr_key, mp_opt->rcvr_key, mp_opt->data_len, mp_opt->csum); break; case MPTCPOPT_MP_JOIN: mp_opt->suboptions |= OPTIONS_MPTCP_MPJ; if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->token = get_unaligned_be32(ptr); ptr += 4; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", mp_opt->backup, mp_opt->join_id, mp_opt->token, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; mp_opt->thmac = get_unaligned_be64(ptr); ptr += 8; mp_opt->nonce = get_unaligned_be32(ptr); ptr += 4; pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", mp_opt->backup, mp_opt->join_id, mp_opt->thmac, mp_opt->nonce); } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { ptr += 2; memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac"); } else { mp_opt->suboptions &= ~OPTIONS_MPTCP_MPJ; } break; case MPTCPOPT_DSS: pr_debug("DSS"); ptr++; /* we must clear 'mpc_map' be able to detect MP_CAPABLE * map vs DSS map in mptcp_incoming_options(), and reconstruct * map info accordingly */ mp_opt->mpc_map = 0; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", mp_opt->data_fin, mp_opt->dsn64, mp_opt->use_map, mp_opt->ack64, mp_opt->use_ack); expected_opsize = TCPOLEN_MPTCP_DSS_BASE; if (mp_opt->use_ack) { if (mp_opt->ack64) expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; else expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; } if (mp_opt->use_map) { if (mp_opt->dsn64) expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; else expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; } /* Always parse any csum presence combination, we will enforce * RFC 8684 Section 3.3.0 checks later in subflow_data_ready */ if (opsize != expected_opsize && opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_ack = get_unaligned_be32(ptr); ptr += 4; } pr_debug("data_ack=%llu", mp_opt->data_ack); } if (mp_opt->use_map) { if (mp_opt->dsn64) { mp_opt->data_seq = get_unaligned_be64(ptr); ptr += 8; } else { mp_opt->data_seq = get_unaligned_be32(ptr); ptr += 4; } mp_opt->subflow_seq = get_unaligned_be32(ptr); ptr += 4; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->csum = get_unaligned((__force __sum16 *)ptr); ptr += 2; } pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), mp_opt->csum); } break; case MPTCPOPT_ADD_ADDR: mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; if (!mp_opt->echo) { if (opsize == TCPOLEN_MPTCP_ADD_ADDR || opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } else { if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) mp_opt->addr.family = AF_INET; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) mp_opt->addr.family = AF_INET6; #endif else break; } mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; mp_opt->addr.port = 0; mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else { memcpy(mp_opt->addr.addr6.s6_addr, (u8 *)ptr, 16); ptr += 16; if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { mp_opt->addr.port = htons(get_unaligned_be16(ptr)); ptr += 2; } } #endif if (!mp_opt->echo) { mp_opt->ahmac = get_unaligned_be64(ptr); ptr += 8; } pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d", (mp_opt->addr.family == AF_INET6) ? "6" : "", mp_opt->addr.id, mp_opt->ahmac, mp_opt->echo, ntohs(mp_opt->addr.port)); break; case MPTCPOPT_RM_ADDR: if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 || opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX) break; ptr++; mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr); break; case MPTCPOPT_MP_PRIO: if (opsize != TCPOLEN_MPTCP_PRIO) break; mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d", mp_opt->backup); break; case MPTCPOPT_MP_FASTCLOSE: if (opsize != TCPOLEN_MPTCP_FASTCLOSE) break; ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; pr_debug("MP_FASTCLOSE: recv_key=%llu", mp_opt->rcvr_key); break; case MPTCPOPT_RST: if (opsize != TCPOLEN_MPTCP_RST) break; if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; pr_debug("MP_RST: transient=%u reason=%u", mp_opt->reset_transient, mp_opt->reset_reason); break; case MPTCPOPT_MP_FAIL: if (opsize != TCPOLEN_MPTCP_FAIL) break; ptr += 2; mp_opt->suboptions |= OPTION_MPTCP_FAIL; mp_opt->fail_seq = get_unaligned_be64(ptr); pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq); break; default: break; } } void mptcp_get_options(const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; /* initialize option status */ mp_opt->suboptions = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); while (length > 0) { int opcode = *ptr++; int opsize; switch (opcode) { case TCPOPT_EOL: return; case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ length--; continue; default: if (length < 2) return; opsize = *ptr++; if (opsize < 2) /* "silly options" */ return; if (opsize > length) return; /* don't parse partial options */ if (opcode == TCPOPT_MPTCP) mptcp_parse_option(skb, ptr, opsize, mp_opt); ptr += opsize - 2; length -= opsize; } } } bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* we will use snd_isn to detect first pkt [re]transmission * in mptcp_established_options_mp() */ subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; if (subflow->request_mptcp) { opts->suboptions = OPTION_MPTCP_MPC_SYN; opts->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk)); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); *size = TCPOLEN_MPTCP_MPC_SYN; return true; } else if (subflow->request_join) { pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, subflow->local_nonce); opts->suboptions = OPTION_MPTCP_MPJ_SYN; opts->join_id = subflow->local_id; opts->token = subflow->remote_token; opts->nonce = subflow->local_nonce; opts->backup = subflow->request_bkup; *size = TCPOLEN_MPTCP_MPJ_SYN; return true; } return false; } static void clear_3rdack_retransmission(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); icsk->icsk_ack.timeout = 0; icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_ext *mpext; unsigned int data_len; u8 len; /* When skb is not available, we better over-estimate the emitted * options len. A full DSS option (28 bytes) is longer than * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so * tell the caller to defer the estimate to * mptcp_established_options_dss(), which will reserve enough space. */ if (!skb) return false; /* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */ if (subflow->fully_established || snd_data_fin_enable || subflow->snd_isn != TCP_SKB_CB(skb)->seq || sk->sk_state != TCP_ESTABLISHED) return false; if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; opts->csum_reqd = READ_ONCE(msk->csum_enabled); opts->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk)); /* Section 3.1. * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK * packets that start the first subflow of an MPTCP connection, * as well as the first packet that carries data */ if (data_len > 0) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ opts->data_seq = mpext->data_seq; opts->subflow_seq = mpext->subflow_seq; opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); } else { *size = TCPOLEN_MPTCP_MPC_ACK; } pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", subflow, subflow->local_key, subflow->remote_key, data_len); return true; } else if (subflow->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_ACK; memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); *size = TCPOLEN_MPTCP_MPJ_ACK; pr_debug("subflow=%p", subflow); /* we can use the full delegate action helper only from BH context * If we are in process context - sk is flushing the backlog at * socket lock release time - just set the appropriate flag, will * be handled by the release callback */ if (sock_owned_by_user(sk)) set_bit(MPTCP_DELEGATE_ACK, &subflow->delegated_status); else mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_ACK); return true; } return false; } static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_ext *ext) { /* The write_seq value has already been incremented, so the actual * sequence number for the DATA_FIN is one less. */ u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1; if (!ext->use_map || !skb->len) { /* RFC6824 requires a DSS mapping with specific values * if DATA_FIN is set but no data payload is mapped */ ext->data_fin = 1; ext->use_map = 1; ext->dsn64 = 1; ext->data_seq = data_fin_tx_seq; ext->subflow_seq = 0; ext->data_len = 1; } else if (ext->data_seq + ext->data_len == data_fin_tx_seq) { /* If there's an existing DSS mapping and it is the * final mapping, DATA_FIN consumes 1 additional byte of * mapping space. */ ext->data_fin = 1; ext->data_len++; } } static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int dss_size = 0; struct mptcp_ext *mpext; unsigned int ack_size; bool ret = false; u64 ack_seq; opts->csum_reqd = READ_ONCE(msk->csum_enabled); mpext = skb ? mptcp_get_ext(skb) : NULL; if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) { unsigned int map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; if (mpext) { if (opts->csum_reqd) map_size += TCPOLEN_MPTCP_DSS_CHECKSUM; opts->ext_copy = *mpext; } dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); opts->suboptions = OPTION_MPTCP_DSS; ret = true; } /* passive sockets msk will set the 'can_ack' after accept(), even * if the first subflow may have the already the remote key handy */ opts->ext_copy.use_ack = 0; if (!READ_ONCE(msk->can_ack)) { *size = ALIGN(dss_size, 4); return ret; } ack_seq = READ_ONCE(msk->ack_seq); if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = ack_seq; opts->ext_copy.ack64 = 1; } else { ack_size = TCPOLEN_MPTCP_DSS_ACK32; opts->ext_copy.data_ack32 = (uint32_t)ack_seq; opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; opts->suboptions = OPTION_MPTCP_DSS; WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) ack_size += TCPOLEN_MPTCP_DSS_BASE; dss_size += ack_size; *size = ALIGN(dss_size, 4); return true; } static u64 add_addr_generate_hmac(u64 key1, u64 key2, struct mptcp_addr_info *addr) { u16 port = ntohs(addr->port); u8 hmac[SHA256_DIGEST_SIZE]; u8 msg[19]; int i = 0; msg[i++] = addr->id; if (addr->family == AF_INET) { memcpy(&msg[i], &addr->addr.s_addr, 4); i += 4; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) { memcpy(&msg[i], &addr->addr6.s6_addr, 16); i += 16; } #endif msg[i++] = port >> 8; msg[i++] = port & 0xFF; mptcp_crypto_hmac_sha(key1, key2, msg, i, hmac); return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]); } static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; bool echo; int len; /* add addr will strip the existing options, be sure to avoid breaking * MPC/MPJ handshakes */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, &echo, &drop_other_suboptions)) return false; if (drop_other_suboptions) remaining += opt_size; len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; *size = len; if (drop_other_suboptions) { pr_debug("drop other suboptions"); opts->suboptions = 0; /* note that e.g. DSS could have written into the memory * aliased by ahmac, we must reset the field here * to avoid appending the hmac even for ADD_ADDR echo * options */ opts->ahmac = 0; *size -= opt_size; } opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); opts->ahmac = add_addr_generate_hmac(msk->local_key, msk->remote_key, &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); } pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d", opts->addr.id, opts->ahmac, echo, ntohs(opts->addr.port)); return true; } static bool mptcp_established_options_rm_addr(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_rm_list rm_list; int i, len; if (!mptcp_pm_should_rm_signal(msk) || !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list))) return false; len = mptcp_rm_addr_len(&rm_list); if (len < 0) return false; if (remaining < len) return false; *size = len; opts->suboptions |= OPTION_MPTCP_RM_ADDR; opts->rm_list = rm_list; for (i = 0; i < opts->rm_list.nr; i++) pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]); MPTCP_ADD_STATS(sock_net(sk), MPTCP_MIB_RMADDRTX, opts->rm_list.nr); return true; } static bool mptcp_established_options_mp_prio(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); /* can't send MP_PRIO with MPC, as they share the same option space: * 'backup'. Also it makes no sense at all */ if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN) return false; *size = TCPOLEN_MPTCP_PRIO_ALIGN; opts->suboptions |= OPTION_MPTCP_PRIO; opts->backup = subflow->request_bkup; pr_debug("prio=%d", opts->backup); return true; } static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTTX); return true; } static bool mptcp_established_options_fastclose(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); if (likely(!subflow->send_fastclose)) return false; if (remaining < TCPOLEN_MPTCP_FASTCLOSE) return false; *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; opts->rcvr_key = msk->remote_key; pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); return true; } static bool mptcp_established_options_mp_fail(struct sock *sk, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (likely(!subflow->send_mp_fail)) return false; if (remaining < TCPOLEN_MPTCP_FAIL) return false; *size = TCPOLEN_MPTCP_FAIL; opts->suboptions |= OPTION_MPTCP_FAIL; opts->fail_seq = subflow->map_seq; pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILTX); return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); unsigned int opt_size = 0; bool snd_data_fin; bool ret = false; opts->suboptions = 0; if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } /* MP_RST can be used with MP_FASTCLOSE and MP_FAIL if there is room */ if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; if (mptcp_established_options_mp_fail(sk, &mp_fail_size, remaining - opt_size, opts)) { *size += opt_size + mp_fail_size; remaining -= opt_size - mp_fail_size; return true; } } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal */ if (WARN_ON_ONCE(opt_size > remaining)) return false; *size += opt_size; remaining -= opt_size; if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) { *size += opt_size; remaining -= opt_size; ret = true; } return ret; } bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { opts->suboptions = OPTION_MPTCP_MPC_SYNACK; opts->sndr_key = subflow_req->local_key; opts->csum_reqd = subflow_req->csum_reqd; opts->allow_join_id0 = subflow_req->allow_join_id0; *size = TCPOLEN_MPTCP_MPC_SYNACK; pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); return true; } else if (subflow_req->mp_join) { opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; opts->backup = subflow_req->backup; opts->join_id = subflow_req->local_id; opts->thmac = subflow_req->thmac; opts->nonce = subflow_req->local_nonce; pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", subflow_req, opts->backup, opts->join_id, opts->thmac, opts->nonce); *size = TCPOLEN_MPTCP_MPJ_SYNACK; return true; } return false; } static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack * will make the subflow fully established */ if (likely(subflow->fully_established)) { /* on passive sockets, check for 3rd ack retransmission * note that msk is always set by subflow_syn_recv_sock() * for mp_join subflows */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); goto check_notify; } /* we must process OoO packets before the first subflow is fully * established. OoO packets are instead a protocol violation * for MP_JOIN subflows as the peer must not send any data * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) goto set_fully_established; return subflow->mp_capable; } if (subflow->remote_key_valid && (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); goto check_notify; } /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; pr_fallback(msk); mptcp_do_fallback(ssk); return false; } if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); set_fully_established: if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); mptcp_subflow_fully_established(subflow, mp_opt); check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with * the listener close */ if (likely(subflow->pm_notified) || list_empty(&subflow->node)) return true; subflow->pm_notified = 1; if (subflow->mp_join) { clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { mptcp_pm_fully_established(msk, ssk); } return true; reset: mptcp_subflow_reset(ssk); return false; } u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) { u32 old_seq32, cur_seq32; old_seq32 = (u32)old_seq; cur_seq32 = (u32)cur_seq; cur_seq = (old_seq & GENMASK_ULL(63, 32)) + cur_seq32; if (unlikely(cur_seq32 < old_seq32 && before(old_seq32, cur_seq32))) return cur_seq + (1LL << 32); /* reverse wrap could happen, too */ if (unlikely(cur_seq32 > old_seq32 && after(old_seq32, cur_seq32))) return cur_seq - (1LL << 32); return cur_seq; } static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; msk->snd_una = new_snd_una; } static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) { u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt); struct sock *sk = (struct sock *)msk; u64 old_snd_una; mptcp_data_lock(sk); /* avoid ack expansion on update conflict, to reduce the risk of * wrongly expanding to a future ack sequence number, which is way * more dangerous than missing an ack */ old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); /* ACK for data not even sent yet? Ignore.*/ if (unlikely(after64(new_snd_una, snd_nxt))) new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) msk->wnd_end = new_wnd_end; /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt))) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { __mptcp_snd_una_update(msk, new_snd_una); __mptcp_data_acked(sk); } mptcp_data_unlock(sk); trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, new_wnd_end, msk->wnd_end); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) { /* Skip if DATA_FIN was already received. * If updating simultaneously with the recvmsg loop, values * should match. If they mismatch, the peer is misbehaving and * we will prefer the most recent information. */ if (READ_ONCE(msk->rcv_data_fin)) return false; WRITE_ONCE(msk->rcv_data_fin_seq, mptcp_expand_seq(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit)); WRITE_ONCE(msk->rcv_data_fin, 1); return true; } static bool add_addr_hmac_valid(struct mptcp_sock *msk, struct mptcp_options_received *mp_opt) { u64 hmac = 0; if (mp_opt->echo) return true; hmac = add_addr_generate_hmac(msk->remote_key, msk->local_key, &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", msk, hmac, mp_opt->ahmac); return hmac == mp_opt->ahmac; } /* Return false if a subflow has been reset, else return true */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received mp_opt; struct mptcp_ext *mpext; if (__mptcp_check_fallback(msk)) { /* Keep it simple and unconditionally trigger send data cleanup and * pending queue spooling. We will need to acquire the data lock * for more accurate checks, and once the lock is acquired, such * helpers are cheap. */ mptcp_data_lock(subflow->conn); if (sk_stream_memory_free(sk)) __mptcp_check_push(subflow->conn, sk); /* on fallback we just need to ignore the msk-level snd_una, as * this is really plain TCP */ __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt)); __mptcp_data_acked(subflow->conn); mptcp_data_unlock(subflow->conn); return true; } mptcp_get_options(skb, &mp_opt); /* The subflow can be in close state only if check_fully_established() * just sent a reset. If so, tell the caller to ignore the current packet. */ if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return sk->sk_state != TCP_CLOSE; if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && msk->local_key == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); } if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && add_addr_hmac_valid(msk, &mp_opt)) { if (!mp_opt.echo) { mptcp_pm_add_addr_received(sk, &mp_opt.addr); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); } else { mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); } if (mp_opt.addr.port) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { mptcp_pm_mp_prio_received(sk, mp_opt.backup); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); } if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); } if (mp_opt.suboptions & OPTION_MPTCP_RST) { subflow->reset_seen = 1; subflow->reset_reason = mp_opt.reset_reason; subflow->reset_transient = mp_opt.reset_transient; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPRSTRX); } if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) return true; } /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not * need to be allocated or populated. DATA_FIN information, if * present, needs to be updated here before the skb is freed. */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) mptcp_schedule_work((struct sock *)msk); return true; } mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return true; memset(mpext, 0, sizeof(*mpext)); if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data */ mptcp_crypto_key_sha(subflow->remote_key, NULL, &mpext->data_seq); mpext->data_seq++; mpext->subflow_seq = 1; mpext->dsn64 = 1; mpext->mpc_map = 1; mpext->data_fin = 0; } else { mpext->data_seq = mp_opt.data_seq; mpext->subflow_seq = mp_opt.subflow_seq; mpext->dsn64 = mp_opt.dsn64; mpext->data_fin = mp_opt.data_fin; } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); if (mpext->csum_reqd) mpext->csum = mp_opt.csum; } return true; } static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; u64 ack_seq, rcv_wnd_old, rcv_wnd_new; struct mptcp_sock *msk; u32 new_win; u64 win; subflow = mptcp_subflow_ctx(ssk); msk = mptcp_sk(subflow->conn); ack_seq = READ_ONCE(msk->ack_seq); rcv_wnd_new = ack_seq + tp->rcv_wnd; rcv_wnd_old = atomic64_read(&msk->rcv_wnd_sent); if (after64(rcv_wnd_new, rcv_wnd_old)) { u64 rcv_wnd; for (;;) { rcv_wnd = atomic64_cmpxchg(&msk->rcv_wnd_sent, rcv_wnd_old, rcv_wnd_new); if (rcv_wnd == rcv_wnd_old) break; rcv_wnd_old = rcv_wnd; if (before64(rcv_wnd_new, rcv_wnd_old)) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICTUPDATE); goto raise_win; } MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDCONFLICT); } return; } if (rcv_wnd_new != rcv_wnd_old) { raise_win: win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; /* Make sure we do not exceed the maximum possible * scaled window. */ if (unlikely(th->syn)) new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale; if (!tp->rx_opt.rcv_wscale && READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; th->window = htons(new_win); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_RCVWNDSHARED); } } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) { struct csum_pseudo_header header; __wsum csum; /* cfr RFC 8684 3.3.1.: * the data sequence number used in the pseudo-header is * always the 64-bit value, irrespective of what length is used in the * DSS option itself. */ header.data_seq = cpu_to_be64(data_seq); header.subflow_seq = htonl(subflow_seq); header.data_len = htons(data_len); header.csum = 0; csum = csum_partial(&header, sizeof(header), sum); return csum_fold(csum); } static __sum16 mptcp_make_csum(const struct mptcp_ext *mpext) { return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, ~csum_unfold(mpext->csum)); } static void put_len_csum(u16 len, __sum16 csum, void *data) { __sum16 *sumptr = data + 2; __be16 *ptr = data; put_unaligned_be16(len, ptr); put_unaligned(csum, sumptr); } void mptcp_write_options(struct tcphdr *th, __be32 *ptr, struct tcp_sock *tp, struct mptcp_out_options *opts) { const struct sock *ssk = (const struct sock *)tp; struct mptcp_subflow_context *subflow; /* Which options can be used together? * * X: mutually exclusive * O: often used together * C: can be used together in some cases * P: could be used together but we prefer not to (optimisations) * * Opt: | MPC | MPJ | DSS | ADD | RM | PRIO | FAIL | FC | * ------|------|------|------|------|------|------|------|------| * MPC |------|------|------|------|------|------|------|------| * MPJ | X |------|------|------|------|------|------|------| * DSS | X | X |------|------|------|------|------|------| * ADD | X | X | P |------|------|------|------|------| * RM | C | C | C | P |------|------|------|------| * PRIO | X | C | C | C | C |------|------|------| * FAIL | X | X | C | X | X | X |------|------| * FC | X | X | X | X | X | X | X |------| * RST | X | X | X | X | X | X | O | O | * ------|------|------|------|------|------|------|------|------| * * The same applies in mptcp_established_options() function. */ if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; u8 flags = 0; if (mpext->use_ack) { flags = MPTCP_DSS_HAS_ACK; if (mpext->ack64) { len += TCPOLEN_MPTCP_DSS_ACK64; flags |= MPTCP_DSS_ACK64; } else { len += TCPOLEN_MPTCP_DSS_ACK32; } } if (mpext->use_map) { len += TCPOLEN_MPTCP_DSS_MAP64; /* Use only 64-bit mapping flags for now, add * support for optional 32-bit mappings later. */ flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; if (mpext->data_fin) flags |= MPTCP_DSS_DATA_FIN; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { if (mpext->ack64) { put_unaligned_be64(mpext->data_ack, ptr); ptr += 2; } else { put_unaligned_be32(mpext->data_ack32, ptr); ptr += 1; } } if (mpext->use_map) { put_unaligned_be64(mpext->data_seq, ptr); ptr += 2; put_unaligned_be32(mpext->subflow_seq, ptr); ptr += 1; if (opts->csum_reqd) { /* data_len == 0 is reserved for the infinite mapping, * the checksum will also be set to 0. */ put_len_csum(mpext->data_len, (mpext->data_len ? mptcp_make_csum(mpext) : 0), ptr); } else { put_unaligned_be32(mpext->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; } /* We might need to add MP_FAIL options in rare cases */ if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) goto mp_fail; } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; } else { len = TCPOLEN_MPTCP_MPC_ACK; } if (opts->csum_reqd) flag |= MPTCP_CAP_CHECKSUM_REQD; if (!opts->allow_join_id0) flag |= MPTCP_CAP_DENY_JOIN_ID0; *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, MPTCP_SUPPORTED_VERSION, flag); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->sndr_key, ptr); ptr += 2; if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) goto mp_capable_done; put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { put_len_csum(opts->data_len, __mptcp_make_csum(opts->data_seq, opts->subflow_seq, opts->data_len, ~csum_unfold(opts->csum)), ptr); } else { put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; /* MPC is additionally mutually exclusive with MP_PRIO */ goto mp_capable_done; } else if (OPTIONS_MPTCP_MPJ & opts->suboptions) { if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYN, opts->backup, opts->join_id); put_unaligned_be32(opts->token, ptr); ptr += 1; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_SYNACK, opts->backup, opts->join_id); put_unaligned_be64(opts->thmac, ptr); ptr += 2; put_unaligned_be32(opts->nonce, ptr); ptr += 1; } else { *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, TCPOLEN_MPTCP_MPJ_ACK, 0, 0); memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); ptr += 5; } } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (opts->addr.family == AF_INET6) len = TCPOLEN_MPTCP_ADD_ADDR6_BASE; #endif if (opts->addr.port) len += TCPOLEN_MPTCP_PORT_LEN; if (opts->ahmac) { len += sizeof(opts->ahmac); echo = 0; } *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, len, echo, opts->addr.id); if (opts->addr.family == AF_INET) { memcpy((u8 *)ptr, (u8 *)&opts->addr.addr.s_addr, 4); ptr += 1; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (opts->addr.family == AF_INET6) { memcpy((u8 *)ptr, opts->addr.addr6.s6_addr, 16); ptr += 4; } #endif if (!opts->addr.port) { if (opts->ahmac) { put_unaligned_be64(opts->ahmac, ptr); ptr += 2; } } else { u16 port = ntohs(opts->addr.port); if (opts->ahmac) { u8 *bptr = (u8 *)ptr; put_unaligned_be16(port, bptr); bptr += 2; put_unaligned_be64(opts->ahmac, bptr); bptr += 8; put_unaligned_be16(TCPOPT_NOP << 8 | TCPOPT_NOP, bptr); ptr += 3; } else { put_unaligned_be32(port << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); ptr += 1; } } } else if (unlikely(OPTION_MPTCP_FASTCLOSE & opts->suboptions)) { /* FASTCLOSE is mutually exclusive with others except RST */ *ptr++ = mptcp_option(MPTCPOPT_MP_FASTCLOSE, TCPOLEN_MPTCP_FASTCLOSE, 0, 0); put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { mp_fail: /* MP_FAIL is mutually exclusive with others except RST */ subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_fail = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, TCPOLEN_MPTCP_FAIL, 0, 0); put_unaligned_be64(opts->fail_seq, ptr); ptr += 2; if (OPTION_MPTCP_RST & opts->suboptions) goto mp_rst; return; } else if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { mp_rst: *ptr++ = mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, opts->reset_transient, opts->reset_reason); return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { subflow = mptcp_subflow_ctx(ssk); subflow->send_mp_prio = 0; *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, TCPOLEN_MPTCP_PRIO, opts->backup, TCPOPT_NOP); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPPRIOTX); } mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr, 0, opts->rm_list.ids[0]); while (i < opts->rm_list.nr) { u8 id1, id2, id3, id4; id1 = opts->rm_list.ids[i]; id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP; id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP; id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP; put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr); ptr += 1; i += 4; } } if (tp) mptcp_set_rwin(tp, th); } __be32 mptcp_get_reset_option(const struct sk_buff *skb) { const struct mptcp_ext *ext = mptcp_get_ext(skb); u8 flags, reason; if (ext) { flags = ext->reset_transient; reason = ext->reset_reason; return mptcp_option(MPTCPOPT_RST, TCPOLEN_MPTCP_RST, flags, reason); } return htonl(0u); } EXPORT_SYMBOL_GPL(mptcp_get_reset_option);
1332 5027 5028 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_RT_H #define _LINUX_SCHED_RT_H #include <linux/sched.h> struct task_struct; static inline int rt_prio(int prio) { if (unlikely(prio < MAX_RT_PRIO)) return 1; return 0; } static inline int rt_task(struct task_struct *p) { return rt_prio(p->prio); } static inline bool task_is_realtime(struct task_struct *tsk) { int policy = tsk->policy; if (policy == SCHED_FIFO || policy == SCHED_RR) return true; if (policy == SCHED_DEADLINE) return true; return false; } #ifdef CONFIG_RT_MUTEXES /* * Must hold either p->pi_lock or task_rq(p)->lock. */ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) { return p->pi_top_task; } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) #endif extern void normalize_rt_tasks(void); /* * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ #define RR_TIMESLICE (100 * HZ / 1000) #endif /* _LINUX_SCHED_RT_H */
8 19 3751 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt_private.h * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> #include <linux/siphash.h> #include <crypto/hash.h> #include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) #define FSCRYPT_FILE_NONCE_SIZE 16 /* * Minimum size of an fscrypt master key. Note: a longer key will be required * if ciphers with a 256-bit security strength are used. This is just the * absolute minimum, which applies when only 128-bit encryption is used. */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; struct fscrypt_context_v2 { u8 version; /* FSCRYPT_CONTEXT_V2 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 __reserved[4]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; /* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each * encrypted file usually in a hidden extended attribute. It contains the * fields from the fscrypt_policy, in order to identify the encryption algorithm * and key with which the file is encrypted. It also contains a nonce that was * randomly generated by fscrypt itself; this is used as KDF input or as a tweak * to cause different files to be encrypted differently. */ union fscrypt_context { u8 version; struct fscrypt_context_v1 v1; struct fscrypt_context_v2 v2; }; /* * Return the size expected for the given fscrypt_context based on its version * number, or 0 if the context version is unrecognized. */ static inline int fscrypt_context_size(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: BUILD_BUG_ON(sizeof(ctx->v1) != 28); return sizeof(ctx->v1); case FSCRYPT_CONTEXT_V2: BUILD_BUG_ON(sizeof(ctx->v2) != 40); return sizeof(ctx->v2); } return 0; } /* Check whether an fscrypt_context has a recognized version number and size */ static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx, int ctx_size) { return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx); } /* Retrieve the context's nonce, assuming the context was already validated */ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: return ctx->v1.nonce; case FSCRYPT_CONTEXT_V2: return ctx->v2.nonce; } WARN_ON_ONCE(1); return NULL; } union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; struct fscrypt_policy_v2 v2; }; /* * Return the size expected for the given fscrypt_policy based on its version * number, or 0 if the policy version is unrecognized. */ static inline int fscrypt_policy_size(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return sizeof(policy->v1); case FSCRYPT_POLICY_V2: return sizeof(policy->v2); } return 0; } /* Return the contents encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_contents_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.contents_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.contents_encryption_mode; } BUG(); } /* Return the filenames encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.filenames_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.filenames_encryption_mode; } BUG(); } /* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ static inline u8 fscrypt_policy_flags(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.flags; case FSCRYPT_POLICY_V2: return policy->v2.flags; } BUG(); } /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ struct fscrypt_symlink_data { __le16 len; char encrypted_path[]; } __packed; /** * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption * @tfm: crypto API transform object * @blk_key: key for blk-crypto * * Normally only one of the fields will be non-NULL. */ struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT struct blk_crypto_key *blk_key; #endif }; /* * fscrypt_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ struct fscrypt_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; /* True if ci_enc_key should be freed when this fscrypt_info is freed */ bool ci_owns_key; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT /* * True if this inode will use inline encryption (blk-crypto) instead of * the traditional filesystem-layer encryption. */ bool ci_inlinecrypt; #endif /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; /* Back-pointer to the inode */ struct inode *ci_inode; /* * The master key with which this inode was unlocked (decrypted). This * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. * Only used when ->ci_master_key is set. */ struct list_head ci_master_key_link; /* * If non-NULL, then encryption is done using the master key directly * and ci_enc_key will equal ci_direct_key->dk_key. */ struct fscrypt_direct_key *ci_direct_key; /* * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 * key. This is only set for directories that use a keyed dirhash over * the plaintext filenames -- currently just casefolded directories. */ siphash_key_t ci_dirhash_key; bool ci_dirhash_key_initialized; /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; /* Hashed inode number. Only set for IV_INO_LBLK_32 */ u32 ci_hashed_ino; }; typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, } fscrypt_direction_t; /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, unsigned int len, unsigned int offs, gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) #define fscrypt_err(inode, fmt, ...) \ fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 union fscrypt_iv { struct { /* logical block number within the file */ __le64 lblk_num; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; u8 raw[FSCRYPT_MAX_IV_SIZE]; __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); /* hkdf.c */ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as * the first byte of the HKDF application-specific info string to guarantee that * info strings are never repeated between contexts. This ensures that all HKDF * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. */ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */ #define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ #define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT int fscrypt_select_encryption_impl(struct fscrypt_info *ci); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in * @prep_key, depending on which encryption implementation the file will use. */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). * I.e., in some cases (namely, if this prep_key is a per-mode * encryption key) another task can publish blk_key or tfm concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ if (fscrypt_using_inline_encryption(ci)) return smp_load_acquire(&prep_key->blk_key) != NULL; return smp_load_acquire(&prep_key->tfm) != NULL; } #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci) { return 0; } static inline bool fscrypt_using_inline_encryption(const struct fscrypt_info *ci) { return false; } static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; } static inline void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { } static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /* keyring.c */ /* * fscrypt_master_key_secret - secret key material of an in-use master key */ struct fscrypt_master_key_secret { /* * For v2 policy keys: HKDF context keyed by this master key. * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). */ struct fscrypt_hkdf hkdf; /* * Size of the raw key in bytes. This remains set even if ->raw was * zeroized due to no longer being needed. I.e. we still remember the * size of the key even if we don't need to remember the key itself. */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ u8 raw[FSCRYPT_MAX_KEY_SIZE]; } __randomize_layout; /* * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the * filesystem and can be used to "unlock" the encrypted files which were * encrypted with it. */ struct fscrypt_master_key { /* * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; /* Semaphore that protects ->mk_secret and ->mk_users */ struct rw_semaphore mk_sem; /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * * There is one active ref associated with ->mk_secret being present, * and one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, * which is then held temporarily while the key is operated on. */ refcount_t mk_active_refs; refcount_t mk_struct_refs; struct rcu_head mk_rcu_head; /* * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is * executed, this is wiped and no new inodes can be unlocked with this * key; however, there may still be inodes in ->mk_decrypted_inodes * which could not be evicted. As long as some inodes still remain, * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. * * While ->mk_secret is present, one ref in ->mk_active_refs is held. * * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs * associated with this field is protected by ->mk_sem as well. */ struct fscrypt_master_key_secret mk_secret; /* * For v1 policy keys: an arbitrary key descriptor which was assigned by * userspace (->descriptor). * * For v2 policy keys: a cryptographic hash of this key (->identifier). */ struct fscrypt_key_specifier mk_spec; /* * Keyring which contains a key of type 'key_type_fscrypt_user' for each * user who has added this key. Normally each key will be added by just * one user, but it's possible that multiple users share a key, and in * that case we need to keep track of those users so that one user can't * remove the key before the others want it removed too. * * This is NULL for v1 policy keys; those can only be added by root. * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic * search+insert along with proper synchronization with ->mk_secret.) */ struct key *mk_users; /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; /* * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. */ struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; } __randomize_layout; static inline bool is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) { /* * The READ_ONCE() is only necessary for fscrypt_drop_inode(). * fscrypt_drop_inode() runs in atomic context, so it can't take the key * semaphore and thus 'secret' can change concurrently which would be a * data race. But fscrypt_drop_inode() only need to know whether the * secret *was* present at the time of check, so READ_ONCE() suffices. */ return READ_ONCE(secret->size) != 0; } static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return "descriptor"; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return "identifier"; } return "[unknown]"; } static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return FSCRYPT_KEY_DESCRIPTOR_SIZE; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return FSCRYPT_KEY_IDENTIFIER_SIZE; } return 0; } void fscrypt_put_master_key(struct fscrypt_master_key *mk); void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int __init fscrypt_init_keyring(void); /* keysetup.c */ struct fscrypt_mode { const char *friendly_name; const char *cipher_str; int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ int logged_cryptoapi_impl; int logged_blk_crypto_native; int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); /** * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. * Then require that the key be present and return -ENOKEY otherwise. * * No locks are needed, and the key will live as long as the struct inode --- so * it won't go away from under you. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_require_key(struct inode *inode) { if (IS_ENCRYPTED(inode)) { int err = fscrypt_get_encryption_info(inode, false); if (err) return err; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } return 0; } /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key); int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); /* policy.c */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */
1167 625 33 129 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 /* SPDX-License-Identifier: GPL-2.0-only */ /* * pm_runtime.h - Device run-time power management helper functions. * * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl> */ #ifndef _LINUX_PM_RUNTIME_H #define _LINUX_PM_RUNTIME_H #include <linux/device.h> #include <linux/notifier.h> #include <linux/pm.h> #include <linux/jiffies.h> /* Runtime PM flag argument bits */ #define RPM_ASYNC 0x01 /* Request is asynchronous */ #define RPM_NOWAIT 0x02 /* Don't wait for concurrent state change */ #define RPM_GET_PUT 0x04 /* Increment/decrement the usage_count */ #define RPM_AUTO 0x08 /* Use autosuspend_delay */ /* * Use this for defining a set of PM operations to be used in all situations * (system suspend, hibernation or runtime PM). * * Note that the behaviour differs from the deprecated UNIVERSAL_DEV_PM_OPS() * macro, which uses the provided callbacks for both runtime PM and system * sleep, while DEFINE_RUNTIME_DEV_PM_OPS() uses pm_runtime_force_suspend() * and pm_runtime_force_resume() for its system sleep callbacks. * * If the underlying dev_pm_ops struct symbol has to be exported, use * EXPORT_RUNTIME_DEV_PM_OPS() or EXPORT_GPL_RUNTIME_DEV_PM_OPS() instead. */ #define DEFINE_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ _DEFINE_DEV_PM_OPS(name, pm_runtime_force_suspend, \ pm_runtime_force_resume, suspend_fn, \ resume_fn, idle_fn) #define EXPORT_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn) \ EXPORT_GPL_DEV_PM_OPS(name) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #define EXPORT_NS_GPL_RUNTIME_DEV_PM_OPS(name, suspend_fn, resume_fn, idle_fn, ns) \ EXPORT_NS_GPL_DEV_PM_OPS(name, ns) = { \ RUNTIME_PM_OPS(suspend_fn, resume_fn, idle_fn) \ } #ifdef CONFIG_PM extern struct workqueue_struct *pm_wq; static inline bool queue_pm_work(struct work_struct *work) { return queue_work(pm_wq, work); } extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int pm_runtime_force_resume(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); extern int __pm_runtime_resume(struct device *dev, int rpmflags); extern int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); extern void pm_runtime_irq_safe(struct device *dev); extern void __pm_runtime_use_autosuspend(struct device *dev, bool use); extern void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); extern u64 pm_runtime_autosuspend_expiration(struct device *dev); extern void pm_runtime_set_memalloc_noio(struct device *dev, bool enable); extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); extern void pm_runtime_release_supplier(struct device_link *link); extern int devm_pm_runtime_enable(struct device *dev); /** * pm_runtime_get_if_in_use - Conditionally bump up runtime PM usage counter. * @dev: Target device. * * Increment the runtime PM usage counter of @dev if its runtime PM status is * %RPM_ACTIVE and its runtime PM usage counter is greater than 0. */ static inline int pm_runtime_get_if_in_use(struct device *dev) { return pm_runtime_get_if_active(dev, false); } /** * pm_suspend_ignore_children - Set runtime PM behavior regarding children. * @dev: Target device. * @enable: Whether or not to ignore possible dependencies on children. * * The dependencies of @dev on its children will not be taken into account by * the runtime PM framework going forward if @enable is %true, or they will * be taken into account otherwise. */ static inline void pm_suspend_ignore_children(struct device *dev, bool enable) { dev->power.ignore_children = enable; } /** * pm_runtime_get_noresume - Bump up runtime PM usage counter of a device. * @dev: Target device. */ static inline void pm_runtime_get_noresume(struct device *dev) { atomic_inc(&dev->power.usage_count); } /** * pm_runtime_put_noidle - Drop runtime PM usage counter of a device. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev unless it is 0 already. */ static inline void pm_runtime_put_noidle(struct device *dev) { atomic_add_unless(&dev->power.usage_count, -1, 0); } /** * pm_runtime_suspended - Check whether or not a device is runtime-suspended. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev and its runtime PM status is * %RPM_SUSPENDED, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED && !dev->power.disable_depth; } /** * pm_runtime_active - Check whether or not a device is runtime-active. * @dev: Target device. * * Return %true if runtime PM is disabled for @dev or its runtime PM status is * %RPM_ACTIVE, or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev and its runtime PM * status cannot change. */ static inline bool pm_runtime_active(struct device *dev) { return dev->power.runtime_status == RPM_ACTIVE || dev->power.disable_depth; } /** * pm_runtime_status_suspended - Check if runtime PM status is "suspended". * @dev: Target device. * * Return %true if the runtime PM status of @dev is %RPM_SUSPENDED, or %false * otherwise, regardless of whether or not runtime PM has been enabled for @dev. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which the * runtime PM status of @dev cannot change. */ static inline bool pm_runtime_status_suspended(struct device *dev) { return dev->power.runtime_status == RPM_SUSPENDED; } /** * pm_runtime_enabled - Check if runtime PM is enabled. * @dev: Target device. * * Return %true if runtime PM is enabled for @dev or %false otherwise. * * Note that the return value of this function can only be trusted if it is * called under the runtime PM lock of @dev or under conditions in which * runtime PM cannot be either disabled or enabled for @dev. */ static inline bool pm_runtime_enabled(struct device *dev) { return !dev->power.disable_depth; } /** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. * * Return %true if @dev is a special device without runtime PM callbacks or * %false otherwise. */ static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return dev->power.no_callbacks; } /** * pm_runtime_mark_last_busy - Update the last access time of a device. * @dev: Target device. * * Update the last access time of @dev used by the runtime PM autosuspend * mechanism to the current time as returned by ktime_get_mono_fast_ns(). */ static inline void pm_runtime_mark_last_busy(struct device *dev) { WRITE_ONCE(dev->power.last_busy, ktime_get_mono_fast_ns()); } /** * pm_runtime_is_irq_safe - Check if runtime PM can work in interrupt context. * @dev: Target device. * * Return %true if @dev has been marked as an "IRQ-safe" device (with respect * to runtime PM), in which case its runtime PM callabcks can be expected to * work correctly when invoked from interrupt handlers. */ static inline bool pm_runtime_is_irq_safe(struct device *dev) { return dev->power.irq_safe; } extern u64 pm_runtime_suspended_time(struct device *dev); #else /* !CONFIG_PM */ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int pm_runtime_force_resume(struct device *dev) { return 0; } static inline int __pm_runtime_idle(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_suspend(struct device *dev, int rpmflags) { return -ENOSYS; } static inline int __pm_runtime_resume(struct device *dev, int rpmflags) { return 1; } static inline int pm_schedule_suspend(struct device *dev, unsigned int delay) { return -ENOSYS; } static inline int pm_runtime_get_if_in_use(struct device *dev) { return -EINVAL; } static inline int pm_runtime_get_if_active(struct device *dev, bool ign_usage_count) { return -EINVAL; } static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} static inline int devm_pm_runtime_enable(struct device *dev) { return 0; } static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} static inline bool pm_runtime_suspended(struct device *dev) { return false; } static inline bool pm_runtime_active(struct device *dev) { return true; } static inline bool pm_runtime_status_suspended(struct device *dev) { return false; } static inline bool pm_runtime_enabled(struct device *dev) { return false; } static inline void pm_runtime_no_callbacks(struct device *dev) {} static inline void pm_runtime_irq_safe(struct device *dev) {} static inline bool pm_runtime_is_irq_safe(struct device *dev) { return false; } static inline bool pm_runtime_has_no_callbacks(struct device *dev) { return false; } static inline void pm_runtime_mark_last_busy(struct device *dev) {} static inline void __pm_runtime_use_autosuspend(struct device *dev, bool use) {} static inline void pm_runtime_set_autosuspend_delay(struct device *dev, int delay) {} static inline u64 pm_runtime_autosuspend_expiration( struct device *dev) { return 0; } static inline void pm_runtime_set_memalloc_noio(struct device *dev, bool enable){} static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} static inline void pm_runtime_release_supplier(struct device_link *link) {} #endif /* !CONFIG_PM */ /** * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it. * @dev: Target device. * * Invoke the "idle check" callback of @dev and, depending on its return value, * set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend has been enabled for it). */ static inline int pm_runtime_idle(struct device *dev) { return __pm_runtime_idle(dev, 0); } /** * pm_runtime_suspend - Suspend a device synchronously. * @dev: Target device. */ static inline int pm_runtime_suspend(struct device *dev) { return __pm_runtime_suspend(dev, 0); } /** * pm_runtime_autosuspend - Set up autosuspend of a device or suspend it. * @dev: Target device. * * Set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend is enabled for it) without engaging its "idle check" callback. */ static inline int pm_runtime_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_AUTO); } /** * pm_runtime_resume - Resume a device synchronously. * @dev: Target device. */ static inline int pm_runtime_resume(struct device *dev) { return __pm_runtime_resume(dev, 0); } /** * pm_request_idle - Queue up "idle check" execution for a device. * @dev: Target device. * * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev * asynchronously. */ static inline int pm_request_idle(struct device *dev) { return __pm_runtime_idle(dev, RPM_ASYNC); } /** * pm_request_resume - Queue up runtime-resume of a device. * @dev: Target device. */ static inline int pm_request_resume(struct device *dev) { return __pm_runtime_resume(dev, RPM_ASYNC); } /** * pm_request_autosuspend - Queue up autosuspend of a device. * @dev: Target device. * * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev * asynchronously. */ static inline int pm_request_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_get - Bump up usage counter and queue up resume of a device. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and queue up a work item to * carry out runtime-resume of it. */ static inline int pm_runtime_get(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_get_sync - Bump up usage counter of a device and resume it. * @dev: Target device. * * Bump up the runtime PM usage counter of @dev and carry out runtime-resume of * it synchronously. * * The possible return values of this function are the same as for * pm_runtime_resume() and the runtime PM usage counter of @dev remains * incremented in all cases, even if it returns an error code. * Consider using pm_runtime_resume_and_get() instead of it, especially * if its return value is checked by the caller, as this is likely to result * in cleaner code. */ static inline int pm_runtime_get_sync(struct device *dev) { return __pm_runtime_resume(dev, RPM_GET_PUT); } /** * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. * @dev: Target device. * * Resume @dev synchronously and if that is successful, increment its runtime * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been * incremented or a negative error code otherwise. */ static inline int pm_runtime_resume_and_get(struct device *dev) { int ret; ret = __pm_runtime_resume(dev, RPM_GET_PUT); if (ret < 0) { pm_runtime_put_noidle(dev); return ret; } return 0; } /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). */ static inline int pm_runtime_put(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT | RPM_ASYNC); } /** * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). */ static inline int pm_runtime_put_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); } /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, invoke the "idle check" callback of @dev and, depending on its * return value, set up autosuspend of @dev or suspend it (depending on whether * or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_idle() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync(struct device *dev) { return __pm_runtime_idle(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_suspend - Drop device usage counter and suspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, carry out runtime-suspend of @dev synchronously. * * The possible return values of this function are the same as for * pm_runtime_suspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_suspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT); } /** * pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0. * @dev: Target device. * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending * on whether or not autosuspend has been enabled for it). * * The possible return values of this function are the same as for * pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains * decremented in all cases, even if it returns an error code. */ static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO); } /** * pm_runtime_set_active - Set runtime PM status to "active". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_ACTIVE and ensure that dependencies * of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_active(struct device *dev) { return __pm_runtime_set_status(dev, RPM_ACTIVE); } /** * pm_runtime_set_suspended - Set runtime PM status to "suspended". * @dev: Target device. * * Set the runtime PM status of @dev to %RPM_SUSPENDED and ensure that * dependencies of it will be taken into account. * * It is not valid to call this function for devices with runtime PM enabled. */ static inline int pm_runtime_set_suspended(struct device *dev) { return __pm_runtime_set_status(dev, RPM_SUSPENDED); } /** * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * * Prevent the runtime PM framework from working with @dev (by incrementing its * "blocking" counter). * * For each invocation of this function for @dev there must be a matching * pm_runtime_enable() call in order for runtime PM to be enabled for it. */ static inline void pm_runtime_disable(struct device *dev) { __pm_runtime_disable(dev, true); } /** * pm_runtime_use_autosuspend - Allow autosuspend to be used for a device. * @dev: Target device. * * Allow the runtime PM autosuspend mechanism to be used for @dev whenever * requested (or "autosuspend" will be handled as direct runtime-suspend for * it). * * NOTE: It's important to undo this with pm_runtime_dont_use_autosuspend() * at driver exit time unless your driver initially enabled pm_runtime * with devm_pm_runtime_enable() (which handles it for you). */ static inline void pm_runtime_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, true); } /** * pm_runtime_dont_use_autosuspend - Prevent autosuspend from being used. * @dev: Target device. * * Prevent the runtime PM autosuspend mechanism from being used for @dev which * means that "autosuspend" will be handled as direct runtime-suspend for it * going forward. */ static inline void pm_runtime_dont_use_autosuspend(struct device *dev) { __pm_runtime_use_autosuspend(dev, false); } #endif
4 4 4 4 4 8 8 8 8 8 8 8 8 6 6 6 6 6 29 29 29 29 29 29 29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2015 Intel Deutschland GmbH * Copyright (C) 2022 Intel Corporation */ #include <net/mac80211.h> #include "ieee80211_i.h" #include "trace.h" #include "driver-ops.h" #include "debugfs_sta.h" #include "debugfs_netdev.h" int drv_start(struct ieee80211_local *local) { int ret; might_sleep(); if (WARN_ON(local->started)) return -EALREADY; trace_drv_start(local); local->started = true; /* allow rx frames */ smp_mb(); ret = local->ops->start(&local->hw); trace_drv_return_int(local, ret); if (ret) local->started = false; return ret; } void drv_stop(struct ieee80211_local *local) { might_sleep(); if (WARN_ON(!local->started)) return; trace_drv_stop(local); local->ops->stop(&local->hw); trace_drv_return_void(local); /* sync away all work on the tasklet before clearing started */ tasklet_disable(&local->tasklet); tasklet_enable(&local->tasklet); barrier(); local->started = false; } int drv_add_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { int ret; might_sleep(); if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_AP_VLAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && !(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE)))) return -EINVAL; trace_drv_add_interface(local, sdata); ret = local->ops->add_interface(&local->hw, &sdata->vif); trace_drv_return_int(local, ret); if (ret == 0) sdata->flags |= IEEE80211_SDATA_IN_DRIVER; return ret; } int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type, bool p2p) { int ret; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_change_interface(local, sdata, type, p2p); ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); trace_drv_return_int(local, ret); return ret; } void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_remove_interface(local, sdata); local->ops->remove_interface(&local->hw, &sdata->vif); sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; trace_drv_return_void(local); } __must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, enum ieee80211_sta_state new_state) { int ret = 0; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); if (local->ops->sta_state) { ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, old_state, new_state); } else if (old_state == IEEE80211_STA_AUTH && new_state == IEEE80211_STA_ASSOC) { ret = drv_sta_add(local, sdata, &sta->sta); if (ret == 0) { sta->uploaded = true; if (rcu_access_pointer(sta->sta.rates)) drv_sta_rate_tbl_update(local, sdata, &sta->sta); } } else if (old_state == IEEE80211_STA_ASSOC && new_state == IEEE80211_STA_AUTH) { drv_sta_remove(local, sdata, &sta->sta); } trace_drv_return_int(local, ret); return ret; } __must_check int drv_sta_set_txpwr(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { int ret = -EOPNOTSUPP; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_sta_set_txpwr(local, sdata, &sta->sta); if (local->ops->sta_set_txpwr) ret = local->ops->sta_set_txpwr(&local->hw, &sdata->vif, &sta->sta); trace_drv_return_int(local, ret); return ret; } void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u32 changed) { sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return; WARN_ON(changed & IEEE80211_RC_SUPP_RATES_CHANGED && (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT)); trace_drv_sta_rc_update(local, sdata, sta, changed); if (local->ops->sta_rc_update) local->ops->sta_rc_update(&local->hw, &sdata->vif, sta, changed); trace_drv_return_void(local); } int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_link_data *link, u16 ac, const struct ieee80211_tx_queue_params *params) { struct ieee80211_sub_if_data *sdata = link->sdata; int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; if (sdata->vif.active_links && !(sdata->vif.active_links & BIT(link->link_id))) return 0; if (params->cw_min == 0 || params->cw_min > params->cw_max) { /* * If we can't configure hardware anyway, don't warn. We may * never have initialized the CW parameters. */ WARN_ONCE(local->ops->conf_tx, "%s: invalid CW_min/CW_max: %d/%d\n", sdata->name, params->cw_min, params->cw_max); return -EINVAL; } trace_drv_conf_tx(local, sdata, link->link_id, ac, params); if (local->ops->conf_tx) ret = local->ops->conf_tx(&local->hw, &sdata->vif, link->link_id, ac, params); trace_drv_return_int(local, ret); return ret; } u64 drv_get_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { u64 ret = -1ULL; might_sleep(); if (!check_sdata_in_driver(sdata)) return ret; trace_drv_get_tsf(local, sdata); if (local->ops->get_tsf) ret = local->ops->get_tsf(&local->hw, &sdata->vif); trace_drv_return_u64(local, ret); return ret; } void drv_set_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u64 tsf) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_set_tsf(local, sdata, tsf); if (local->ops->set_tsf) local->ops->set_tsf(&local->hw, &sdata->vif, tsf); trace_drv_return_void(local); } void drv_offset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, s64 offset) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_offset_tsf(local, sdata, offset); if (local->ops->offset_tsf) local->ops->offset_tsf(&local->hw, &sdata->vif, offset); trace_drv_return_void(local); } void drv_reset_tsf(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { might_sleep(); if (!check_sdata_in_driver(sdata)) return; trace_drv_reset_tsf(local, sdata); if (local->ops->reset_tsf) local->ops->reset_tsf(&local->hw, &sdata->vif); trace_drv_return_void(local); } int drv_assign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx) { int ret = 0; drv_verify_link_exists(sdata, link_conf); if (!check_sdata_in_driver(sdata)) return -EIO; if (sdata->vif.active_links && !(sdata->vif.active_links & BIT(link_conf->link_id))) return 0; trace_drv_assign_vif_chanctx(local, sdata, link_conf, ctx); if (local->ops->assign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); ret = local->ops->assign_vif_chanctx(&local->hw, &sdata->vif, link_conf, &ctx->conf); } trace_drv_return_int(local, ret); return ret; } void drv_unassign_vif_chanctx(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf, struct ieee80211_chanctx *ctx) { might_sleep(); drv_verify_link_exists(sdata, link_conf); if (!check_sdata_in_driver(sdata)) return; if (sdata->vif.active_links && !(sdata->vif.active_links & BIT(link_conf->link_id))) return; trace_drv_unassign_vif_chanctx(local, sdata, link_conf, ctx); if (local->ops->unassign_vif_chanctx) { WARN_ON_ONCE(!ctx->driver_present); local->ops->unassign_vif_chanctx(&local->hw, &sdata->vif, link_conf, &ctx->conf); } trace_drv_return_void(local); } int drv_switch_vif_chanctx(struct ieee80211_local *local, struct ieee80211_vif_chanctx_switch *vifs, int n_vifs, enum ieee80211_chanctx_switch_mode mode) { int ret = 0; int i; might_sleep(); if (!local->ops->switch_vif_chanctx) return -EOPNOTSUPP; for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); WARN_ON_ONCE(!old_ctx->driver_present); WARN_ON_ONCE((mode == CHANCTX_SWMODE_SWAP_CONTEXTS && new_ctx->driver_present) || (mode == CHANCTX_SWMODE_REASSIGN_VIF && !new_ctx->driver_present)); } trace_drv_switch_vif_chanctx(local, vifs, n_vifs, mode); ret = local->ops->switch_vif_chanctx(&local->hw, vifs, n_vifs, mode); trace_drv_return_int(local, ret); if (!ret && mode == CHANCTX_SWMODE_SWAP_CONTEXTS) { for (i = 0; i < n_vifs; i++) { struct ieee80211_chanctx *new_ctx = container_of(vifs[i].new_ctx, struct ieee80211_chanctx, conf); struct ieee80211_chanctx *old_ctx = container_of(vifs[i].old_ctx, struct ieee80211_chanctx, conf); new_ctx->driver_present = true; old_ctx->driver_present = false; } } return ret; } int drv_ampdu_action(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_ampdu_params *params) { int ret = -EOPNOTSUPP; might_sleep(); if (!sdata) return -EIO; sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; trace_drv_ampdu_action(local, sdata, params); if (local->ops->ampdu_action) ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params); trace_drv_return_int(local, ret); return ret; } void drv_link_info_changed(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *info, int link_id, u64 changed) { might_sleep(); if (WARN_ON_ONCE(changed & (BSS_CHANGED_BEACON | BSS_CHANGED_BEACON_ENABLED) && sdata->vif.type != NL80211_IFTYPE_AP && sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT && sdata->vif.type != NL80211_IFTYPE_OCB)) return; if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && !sdata->vif.bss_conf.mu_mimo_owner && !(changed & BSS_CHANGED_TXPOWER)))) return; if (!check_sdata_in_driver(sdata)) return; if (sdata->vif.active_links && !(sdata->vif.active_links & BIT(link_id))) return; trace_drv_link_info_changed(local, sdata, info, changed); if (local->ops->link_info_changed) local->ops->link_info_changed(&local->hw, &sdata->vif, info, changed); else if (local->ops->bss_info_changed) local->ops->bss_info_changed(&local->hw, &sdata->vif, info, changed); trace_drv_return_void(local); } int drv_set_key(struct ieee80211_local *local, enum set_key_cmd cmd, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_key_conf *key) { int ret; might_sleep(); sdata = get_bss_sdata(sdata); if (!check_sdata_in_driver(sdata)) return -EIO; if (WARN_ON(key->link_id >= 0 && sdata->vif.active_links && !(sdata->vif.active_links & BIT(key->link_id)))) return -ENOLINK; trace_drv_set_key(local, cmd, sdata, sta, key); ret = local->ops->set_key(&local->hw, cmd, &sdata->vif, sta, key); trace_drv_return_int(local, ret); return ret; } int drv_change_vif_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, u16 old_links, u16 new_links, struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]) { struct ieee80211_link_data *link; unsigned long links_to_add; unsigned long links_to_rem; unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; if (old_links == new_links) return 0; links_to_add = ~old_links & new_links; links_to_rem = old_links & ~new_links; for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { link = rcu_access_pointer(sdata->link[link_id]); ieee80211_link_debugfs_drv_remove(link); } trace_drv_change_vif_links(local, sdata, old_links, new_links); if (local->ops->change_vif_links) ret = local->ops->change_vif_links(&local->hw, &sdata->vif, old_links, new_links, old); trace_drv_return_int(local, ret); if (ret) return ret; for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { link = rcu_access_pointer(sdata->link[link_id]); ieee80211_link_debugfs_drv_add(link); } return 0; } int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { struct sta_info *info = container_of(sta, struct sta_info, sta); struct link_sta_info *link_sta; unsigned long links_to_add; unsigned long links_to_rem; unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); if (!check_sdata_in_driver(sdata)) return -EIO; old_links &= sdata->vif.active_links; new_links &= sdata->vif.active_links; if (old_links == new_links) return 0; links_to_add = ~old_links & new_links; links_to_rem = old_links & ~new_links; for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { link_sta = rcu_dereference_protected(info->link[link_id], lockdep_is_held(&local->sta_mtx)); ieee80211_link_sta_debugfs_drv_remove(link_sta); } trace_drv_change_sta_links(local, sdata, sta, old_links, new_links); if (local->ops->change_sta_links) ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta, old_links, new_links); trace_drv_return_int(local, ret); if (ret) return ret; for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { link_sta = rcu_dereference_protected(info->link[link_id], lockdep_is_held(&local->sta_mtx)); ieee80211_link_sta_debugfs_drv_add(link_sta); } return 0; }
1099 1099 10153 11536 2894 28 8122 471 472 2022 2022 2022 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * * Copyright (C) 1997 Andrew Main <zefram@fysh.org> * * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> int file_caps_enabled = 1; static int __init file_caps_disable(char *str) { file_caps_enabled = 0; return 1; } __setup("no_file_caps", file_caps_disable); #ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * * http://www.kernel.org/pub/linux/libs/security/linux-privs/ */ static void warn_legacy_capability_use(void) { char name[sizeof(current->comm)]; pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", get_task_comm(name, current)); } /* * Version 2 capabilities worked fine, but the linux/capability.h file * that accompanied their introduction encouraged their use without * the necessary user-space source code changes. As such, we have * created a version 3 with equivalent functionality to version 2, but * with a header change to protect legacy source code from using * version 2 when it wanted to use version 1. If your system has code * that trips the following warning, it is using version 2 specific * capabilities and may be doing so insecurely. * * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { char name[sizeof(current->comm)]; pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", get_task_comm(name, current)); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { const struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; struct __user_cap_data_struct kdata[2]; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (ret) return ret; /* * Annoying legacy format with 64-bit capabilities exposed * as two sets of 32-bit fields, so we need to split the * capability values up. */ kdata[0].effective = pE.val; kdata[1].effective = pE.val >> 32; kdata[0].permitted = pP.val; kdata[1].permitted = pP.val >> 32; kdata[0].inheritable = pI.val; kdata[1].inheritable = pI.val >> 32; /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0]))) return -EFAULT; return 0; } static kernel_cap_t mk_kernel_cap(u32 low, u32 high) { return (kernel_cap_t) { (low | ((u64)high << 32)) & CAP_VALID_MASK }; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[2] = { { 0, }, }; unsigned tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; effective = mk_kernel_cap(kdata[0].effective, kdata[1].effective); permitted = mk_kernel_cap(kdata[0].permitted, kdata[1].permitted); inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable); new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_capability - Does a task have a capability in init_user_ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the initial user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability(struct task_struct *t, int cap) { return has_ns_capability(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability); /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct mnt_idmap *idmap, const struct inode *inode) { return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) && vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode)); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @idmap: idmap of the mount @inode was found from * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap, const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, idmap, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
326 1922 3360 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PID_H #define _LINUX_PID_H #include <linux/rculist.h> #include <linux/wait.h> #include <linux/refcount.h> enum pid_type { PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, PIDTYPE_SID, PIDTYPE_MAX, }; /* * What is struct pid? * * A struct pid is the kernel's internal notion of a process identifier. * It refers to individual tasks, process groups, and sessions. While * there are processes attached to it the struct pid lives in a hash * table, so it and then the processes that it refers to can be found * quickly from the numeric pid value. The attached processes may be * quickly accessed by following pointers from struct pid. * * Storing pid_t values in the kernel and referring to them later has a * problem. The process originally with that pid may have exited and the * pid allocator wrapped, and another process could have come along * and been assigned that pid. * * Referring to user space processes by holding a reference to struct * task_struct has a problem. When the user space process exits * the now useless task_struct is still kept. A task_struct plus a * stack consumes around 10K of low kernel memory. More precisely * this is THREAD_SIZE + sizeof(struct task_struct). By comparison * a struct pid is about 64 bytes. * * Holding a reference to struct pid solves both of these problems. * It is small so holding a reference does not consume a lot of * resources, and since a new struct pid is allocated when the numeric pid * value is reused (when pids wrap around) we don't mistakenly refer to new * processes. */ /* * struct upid is used to get the id of the struct pid, as it is * seen in particular namespace. Later the struct pid is found with * find_pid_ns() using the int nr and struct pid_namespace *ns. */ struct upid { int nr; struct pid_namespace *ns; }; struct pid { refcount_t count; unsigned int level; spinlock_t lock; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; /* wait queue for pidfd notifications */ wait_queue_head_t wait_pidfd; struct rcu_head rcu; struct upid numbers[]; }; extern struct pid init_struct_pid; extern const struct file_operations pidfd_fops; struct file; extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_create(struct pid *pid, unsigned int flags); int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); static inline struct pid *get_pid(struct pid *pid) { if (pid) refcount_inc(&pid->count); return pid; } extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); static inline bool pid_has_task(struct pid *pid, enum pid_type type) { return !hlist_empty(&pid->tasks[type]); } extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); /* * these helpers must be called with the tasklist_lock write-held. */ extern void attach_pid(struct task_struct *task, enum pid_type); extern void detach_pid(struct task_struct *task, enum pid_type); extern void change_pid(struct task_struct *task, enum pid_type, struct pid *pid); extern void exchange_tids(struct task_struct *task, struct task_struct *old); extern void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type); struct pid_namespace; extern struct pid_namespace init_pid_ns; extern int pid_max; extern int pid_max_min, pid_max_max; /* * look up a PID in the hash table. Must be called with the tasklist_lock * or rcu_read_lock() held. * * find_pid_ns() finds the pid in the namespace specified * find_vpid() finds the pid by its virtual id, i.e. in the current namespace * * see also find_task_by_vpid() set in include/linux/sched.h */ extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns); extern struct pid *find_vpid(int nr); /* * Lookup a PID in the hash table, and return with it's count elevated. */ extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); /* * ns_of_pid() returns the pid namespace in which the specified pid was * allocated. * * NOTE: * ns_of_pid() is expected to be called for a process (task) that has * an attached 'struct pid' (see attach_pid(), detach_pid()) i.e @pid * is expected to be non-NULL. If @pid is NULL, caller should handle * the resulting NULL pid-ns. */ static inline struct pid_namespace *ns_of_pid(struct pid *pid) { struct pid_namespace *ns = NULL; if (pid) ns = pid->numbers[pid->level].ns; return ns; } /* * is_child_reaper returns true if the pid is the init process * of the current namespace. As this one could be checked before * pid_ns->child_reaper is assigned in copy_process, we check * with the pid number. */ static inline bool is_child_reaper(struct pid *pid) { return pid->numbers[pid->level].nr == 1; } /* * the helpers to get the pid's id seen from different namespaces * * pid_nr() : global id, i.e. the id seen from the init namespace; * pid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * pid_nr_ns() : id seen from the ns specified. * * see also task_xid_nr() etc in include/linux/sched.h */ static inline pid_t pid_nr(struct pid *pid) { pid_t nr = 0; if (pid) nr = pid->numbers[0].nr; return nr; } pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns); pid_t pid_vnr(struct pid *pid); #define do_each_pid_task(pid, type, task) \ do { \ if ((pid) != NULL) \ hlist_for_each_entry_rcu((task), \ &(pid)->tasks[type], pid_links[type]) { /* * Both old and new leaders may be attached to * the same pid in the middle of de_thread(). */ #define while_each_pid_task(pid, type, task) \ if (type == PIDTYPE_PID) \ break; \ } \ } while (0) #define do_each_pid_thread(pid, type, task) \ do_each_pid_task(pid, type, task) { \ struct task_struct *tg___ = task; \ for_each_thread(tg___, task) { #define while_each_pid_thread(pid, type, task) \ } \ task = tg___; \ } while_each_pid_task(pid, type, task) #endif /* _LINUX_PID_H */
1271 1271 1271 9823 9822 4862 1271 1271 1271 11551 1271 1271 1271 4862 4862 1503 1502 1505 1504 2408 2408 2408 2298 2298 276 276 22 22 22 2296 2296 21 21 276 276 1074 1074 5324 5324 5324 1196 1196 1196 2884 1352 4352 4352 9822 9821 2070 2068 2068 5900 8 1 58 4551 4551 1196 1271 1271 1271 1271 390 389 390 4527 4527 119 396 263 8065 4379 987 147 147 147 146 40 2302 2103 980 3831 1044 997 1216 1333 3300 686 1011 270 884 7 900 4 907 45 555 161 470 381 381 21 76 7504 581 2672 578 4399 1336 35 2197 1755 226 408 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 // SPDX-License-Identifier: GPL-2.0-or-later /* * Security plug functions * * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com> * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com> * Copyright (C) 2016 Mellanox Technologies * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com> */ #define pr_fmt(fmt) "LSM: " fmt #include <linux/bpf.h> #include <linux/capability.h> #include <linux/dcache.h> #include <linux/export.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/kernel_read_file.h> #include <linux/lsm_hooks.h> #include <linux/integrity.h> #include <linux/ima.h> #include <linux/evm.h> #include <linux/fsnotify.h> #include <linux/mman.h> #include <linux/mount.h> #include <linux/personality.h> #include <linux/backing-dev.h> #include <linux/string.h> #include <linux/msg.h> #include <net/flow.h> /* How many LSMs were built into the kernel? */ #define LSM_COUNT (__end_lsm_info - __start_lsm_info) /* * These are descriptions of the reasons that can be passed to the * security_locked_down() LSM hook. Placing this array here allows * all security modules to use the same descriptions for auditing * purposes. */ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = { [LOCKDOWN_NONE] = "none", [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading", [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port", [LOCKDOWN_EFI_TEST] = "/dev/efi_test access", [LOCKDOWN_KEXEC] = "kexec of unsigned images", [LOCKDOWN_HIBERNATION] = "hibernation", [LOCKDOWN_PCI_ACCESS] = "direct PCI access", [LOCKDOWN_IOPORT] = "raw io port access", [LOCKDOWN_MSR] = "raw MSR access", [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables", [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents", [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage", [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO", [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters", [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret", [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality", }; struct security_hook_heads security_hook_heads __ro_after_init; static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain); static struct kmem_cache *lsm_file_cache; static struct kmem_cache *lsm_inode_cache; char *lsm_names; static struct lsm_blob_sizes blob_sizes __ro_after_init; /* Boot-time LSM user choice */ static __initdata const char *chosen_lsm_order; static __initdata const char *chosen_major_lsm; static __initconst const char *const builtin_lsm_order = CONFIG_LSM; /* Ordered list of LSMs to initialize. */ static __initdata struct lsm_info **ordered_lsms; static __initdata struct lsm_info *exclusive; static __initdata bool debug; #define init_debug(...) \ do { \ if (debug) \ pr_info(__VA_ARGS__); \ } while (0) static bool __init is_enabled(struct lsm_info *lsm) { if (!lsm->enabled) return false; return *lsm->enabled; } /* Mark an LSM's enabled flag. */ static int lsm_enabled_true __initdata = 1; static int lsm_enabled_false __initdata = 0; static void __init set_enabled(struct lsm_info *lsm, bool enabled) { /* * When an LSM hasn't configured an enable variable, we can use * a hard-coded location for storing the default enabled state. */ if (!lsm->enabled) { if (enabled) lsm->enabled = &lsm_enabled_true; else lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_true) { if (!enabled) lsm->enabled = &lsm_enabled_false; } else if (lsm->enabled == &lsm_enabled_false) { if (enabled) lsm->enabled = &lsm_enabled_true; } else { *lsm->enabled = enabled; } } /* Is an LSM already listed in the ordered LSMs list? */ static bool __init exists_ordered_lsm(struct lsm_info *lsm) { struct lsm_info **check; for (check = ordered_lsms; *check; check++) if (*check == lsm) return true; return false; } /* Append an LSM to the list of ordered LSMs to initialize. */ static int last_lsm __initdata; static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from) { /* Ignore duplicate selections. */ if (exists_ordered_lsm(lsm)) return; if (WARN(last_lsm == LSM_COUNT, "%s: out of LSM slots!?\n", from)) return; /* Enable this LSM, if it is not already set. */ if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; ordered_lsms[last_lsm++] = lsm; init_debug("%s ordered: %s (%s)\n", from, lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); } /* Is an LSM allowed to be initialized? */ static bool __init lsm_allowed(struct lsm_info *lsm) { /* Skip if the LSM is disabled. */ if (!is_enabled(lsm)) return false; /* Not allowed if another exclusive LSM already initialized. */ if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && exclusive) { init_debug("exclusive disabled: %s\n", lsm->name); return false; } return true; } static void __init lsm_set_blob_size(int *need, int *lbs) { int offset; if (*need <= 0) return; offset = ALIGN(*lbs, sizeof(void *)); *lbs = offset + *need; *need = offset; } static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed) { if (!needed) return; lsm_set_blob_size(&needed->lbs_cred, &blob_sizes.lbs_cred); lsm_set_blob_size(&needed->lbs_file, &blob_sizes.lbs_file); /* * The inode blob gets an rcu_head in addition to * what the modules might need. */ if (needed->lbs_inode && blob_sizes.lbs_inode == 0) blob_sizes.lbs_inode = sizeof(struct rcu_head); lsm_set_blob_size(&needed->lbs_inode, &blob_sizes.lbs_inode); lsm_set_blob_size(&needed->lbs_ipc, &blob_sizes.lbs_ipc); lsm_set_blob_size(&needed->lbs_msg_msg, &blob_sizes.lbs_msg_msg); lsm_set_blob_size(&needed->lbs_superblock, &blob_sizes.lbs_superblock); lsm_set_blob_size(&needed->lbs_task, &blob_sizes.lbs_task); lsm_set_blob_size(&needed->lbs_xattr_count, &blob_sizes.lbs_xattr_count); } /* Prepare LSM for initialization. */ static void __init prepare_lsm(struct lsm_info *lsm) { int enabled = lsm_allowed(lsm); /* Record enablement (to handle any following exclusive LSMs). */ set_enabled(lsm, enabled); /* If enabled, do pre-initialization work. */ if (enabled) { if ((lsm->flags & LSM_FLAG_EXCLUSIVE) && !exclusive) { exclusive = lsm; init_debug("exclusive chosen: %s\n", lsm->name); } lsm_set_blob_sizes(lsm->blobs); } } /* Initialize a given LSM, if it is enabled. */ static void __init initialize_lsm(struct lsm_info *lsm) { if (is_enabled(lsm)) { int ret; init_debug("initializing %s\n", lsm->name); ret = lsm->init(); WARN(ret, "%s failed to initialize: %d\n", lsm->name, ret); } } /* Populate ordered LSMs list from comma-separated LSM name list. */ static void __init ordered_lsm_parse(const char *order, const char *origin) { struct lsm_info *lsm; char *sep, *name, *next; /* LSM_ORDER_FIRST is always first. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_FIRST) append_ordered_lsm(lsm, " first"); } /* Process "security=", if given. */ if (chosen_major_lsm) { struct lsm_info *major; /* * To match the original "security=" behavior, this * explicitly does NOT fallback to another Legacy Major * if the selected one was separately disabled: disable * all non-matching Legacy Major LSMs. */ for (major = __start_lsm_info; major < __end_lsm_info; major++) { if ((major->flags & LSM_FLAG_LEGACY_MAJOR) && strcmp(major->name, chosen_major_lsm) != 0) { set_enabled(major, false); init_debug("security=%s disabled: %s (only one legacy major LSM)\n", chosen_major_lsm, major->name); } } } sep = kstrdup(order, GFP_KERNEL); next = sep; /* Walk the list, looking for matching LSMs. */ while ((name = strsep(&next, ",")) != NULL) { bool found = false; for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (strcmp(lsm->name, name) == 0) { if (lsm->order == LSM_ORDER_MUTABLE) append_ordered_lsm(lsm, origin); found = true; } } if (!found) init_debug("%s ignored: %s (not built into kernel)\n", origin, name); } /* Process "security=", if given. */ if (chosen_major_lsm) { for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; if (strcmp(lsm->name, chosen_major_lsm) == 0) append_ordered_lsm(lsm, "security="); } } /* LSM_ORDER_LAST is always last. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (lsm->order == LSM_ORDER_LAST) append_ordered_lsm(lsm, " last"); } /* Disable all LSMs not in the ordered list. */ for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) { if (exists_ordered_lsm(lsm)) continue; set_enabled(lsm, false); init_debug("%s skipped: %s (not in requested order)\n", origin, lsm->name); } kfree(sep); } static void __init lsm_early_cred(struct cred *cred); static void __init lsm_early_task(struct task_struct *task); static int lsm_append(const char *new, char **result); static void __init report_lsm_order(void) { struct lsm_info **lsm, *early; int first = 0; pr_info("initializing lsm="); /* Report each enabled LSM name, comma separated. */ for (early = __start_early_lsm_info; early < __end_early_lsm_info; early++) if (is_enabled(early)) pr_cont("%s%s", first++ == 0 ? "" : ",", early->name); for (lsm = ordered_lsms; *lsm; lsm++) if (is_enabled(*lsm)) pr_cont("%s%s", first++ == 0 ? "" : ",", (*lsm)->name); pr_cont("\n"); } static void __init ordered_lsm_init(void) { struct lsm_info **lsm; ordered_lsms = kcalloc(LSM_COUNT + 1, sizeof(*ordered_lsms), GFP_KERNEL); if (chosen_lsm_order) { if (chosen_major_lsm) { pr_warn("security=%s is ignored because it is superseded by lsm=%s\n", chosen_major_lsm, chosen_lsm_order); chosen_major_lsm = NULL; } ordered_lsm_parse(chosen_lsm_order, "cmdline"); } else ordered_lsm_parse(builtin_lsm_order, "builtin"); for (lsm = ordered_lsms; *lsm; lsm++) prepare_lsm(*lsm); report_lsm_order(); init_debug("cred blob size = %d\n", blob_sizes.lbs_cred); init_debug("file blob size = %d\n", blob_sizes.lbs_file); init_debug("inode blob size = %d\n", blob_sizes.lbs_inode); init_debug("ipc blob size = %d\n", blob_sizes.lbs_ipc); init_debug("msg_msg blob size = %d\n", blob_sizes.lbs_msg_msg); init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock); init_debug("task blob size = %d\n", blob_sizes.lbs_task); init_debug("xattr slots = %d\n", blob_sizes.lbs_xattr_count); /* * Create any kmem_caches needed for blobs */ if (blob_sizes.lbs_file) lsm_file_cache = kmem_cache_create("lsm_file_cache", blob_sizes.lbs_file, 0, SLAB_PANIC, NULL); if (blob_sizes.lbs_inode) lsm_inode_cache = kmem_cache_create("lsm_inode_cache", blob_sizes.lbs_inode, 0, SLAB_PANIC, NULL); lsm_early_cred((struct cred *) current->cred); lsm_early_task(current); for (lsm = ordered_lsms; *lsm; lsm++) initialize_lsm(*lsm); kfree(ordered_lsms); } int __init early_security_init(void) { struct lsm_info *lsm; #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ INIT_HLIST_HEAD(&security_hook_heads.NAME); #include "linux/lsm_hook_defs.h" #undef LSM_HOOK for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { if (!lsm->enabled) lsm->enabled = &lsm_enabled_true; prepare_lsm(lsm); initialize_lsm(lsm); } return 0; } /** * security_init - initializes the security framework * * This should be called early in the kernel initialization sequence. */ int __init security_init(void) { struct lsm_info *lsm; init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*"); init_debug(" CONFIG_LSM=%s\n", builtin_lsm_order); init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*"); /* * Append the names of the early LSM modules now that kmalloc() is * available */ for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) { init_debug(" early started: %s (%s)\n", lsm->name, is_enabled(lsm) ? "enabled" : "disabled"); if (lsm->enabled) lsm_append(lsm->name, &lsm_names); } /* Load LSMs in specified order. */ ordered_lsm_init(); return 0; } /* Save user chosen LSM */ static int __init choose_major_lsm(char *str) { chosen_major_lsm = str; return 1; } __setup("security=", choose_major_lsm); /* Explicitly choose LSM initialization order. */ static int __init choose_lsm_order(char *str) { chosen_lsm_order = str; return 1; } __setup("lsm=", choose_lsm_order); /* Enable LSM order debugging. */ static int __init enable_debug(char *str) { debug = true; return 1; } __setup("lsm.debug", enable_debug); static bool match_last_lsm(const char *list, const char *lsm) { const char *last; if (WARN_ON(!list || !lsm)) return false; last = strrchr(list, ','); if (last) /* Pass the comma, strcmp() will check for '\0' */ last++; else last = list; return !strcmp(last, lsm); } static int lsm_append(const char *new, char **result) { char *cp; if (*result == NULL) { *result = kstrdup(new, GFP_KERNEL); if (*result == NULL) return -ENOMEM; } else { /* Check if it is the last registered name */ if (match_last_lsm(*result, new)) return 0; cp = kasprintf(GFP_KERNEL, "%s,%s", *result, new); if (cp == NULL) return -ENOMEM; kfree(*result); *result = cp; } return 0; } /** * security_add_hooks - Add a modules hooks to the hook lists. * @hooks: the hooks to add * @count: the number of hooks to add * @lsm: the name of the security module * * Each LSM has to register its hooks with the infrastructure. */ void __init security_add_hooks(struct security_hook_list *hooks, int count, const char *lsm) { int i; for (i = 0; i < count; i++) { hooks[i].lsm = lsm; hlist_add_tail_rcu(&hooks[i].list, hooks[i].head); } /* * Don't try to append during early_security_init(), we'll come back * and fix this up afterwards. */ if (slab_is_available()) { if (lsm_append(lsm, &lsm_names) < 0) panic("%s - Cannot get early memory.\n", __func__); } } int call_blocking_lsm_notifier(enum lsm_event event, void *data) { return blocking_notifier_call_chain(&blocking_lsm_notifier_chain, event, data); } EXPORT_SYMBOL(call_blocking_lsm_notifier); int register_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(register_blocking_lsm_notifier); int unregister_blocking_lsm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain, nb); } EXPORT_SYMBOL(unregister_blocking_lsm_notifier); /** * lsm_cred_alloc - allocate a composite cred blob * @cred: the cred that needs a blob * @gfp: allocation type * * Allocate the cred blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_cred_alloc(struct cred *cred, gfp_t gfp) { if (blob_sizes.lbs_cred == 0) { cred->security = NULL; return 0; } cred->security = kzalloc(blob_sizes.lbs_cred, gfp); if (cred->security == NULL) return -ENOMEM; return 0; } /** * lsm_early_cred - during initialization allocate a composite cred blob * @cred: the cred that needs a blob * * Allocate the cred blob for all the modules */ static void __init lsm_early_cred(struct cred *cred) { int rc = lsm_cred_alloc(cred, GFP_KERNEL); if (rc) panic("%s: Early cred alloc failed.\n", __func__); } /** * lsm_file_alloc - allocate a composite file blob * @file: the file that needs a blob * * Allocate the file blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_file_alloc(struct file *file) { if (!lsm_file_cache) { file->f_security = NULL; return 0; } file->f_security = kmem_cache_zalloc(lsm_file_cache, GFP_KERNEL); if (file->f_security == NULL) return -ENOMEM; return 0; } /** * lsm_inode_alloc - allocate a composite inode blob * @inode: the inode that needs a blob * * Allocate the inode blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ int lsm_inode_alloc(struct inode *inode) { if (!lsm_inode_cache) { inode->i_security = NULL; return 0; } inode->i_security = kmem_cache_zalloc(lsm_inode_cache, GFP_NOFS); if (inode->i_security == NULL) return -ENOMEM; return 0; } /** * lsm_task_alloc - allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_task_alloc(struct task_struct *task) { if (blob_sizes.lbs_task == 0) { task->security = NULL; return 0; } task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL); if (task->security == NULL) return -ENOMEM; return 0; } /** * lsm_ipc_alloc - allocate a composite ipc blob * @kip: the ipc that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_ipc_alloc(struct kern_ipc_perm *kip) { if (blob_sizes.lbs_ipc == 0) { kip->security = NULL; return 0; } kip->security = kzalloc(blob_sizes.lbs_ipc, GFP_KERNEL); if (kip->security == NULL) return -ENOMEM; return 0; } /** * lsm_msg_msg_alloc - allocate a composite msg_msg blob * @mp: the msg_msg that needs a blob * * Allocate the ipc blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_msg_msg_alloc(struct msg_msg *mp) { if (blob_sizes.lbs_msg_msg == 0) { mp->security = NULL; return 0; } mp->security = kzalloc(blob_sizes.lbs_msg_msg, GFP_KERNEL); if (mp->security == NULL) return -ENOMEM; return 0; } /** * lsm_early_task - during initialization allocate a composite task blob * @task: the task that needs a blob * * Allocate the task blob for all the modules */ static void __init lsm_early_task(struct task_struct *task) { int rc = lsm_task_alloc(task); if (rc) panic("%s: Early task alloc failed.\n", __func__); } /** * lsm_superblock_alloc - allocate a composite superblock blob * @sb: the superblock that needs a blob * * Allocate the superblock blob for all the modules * * Returns 0, or -ENOMEM if memory can't be allocated. */ static int lsm_superblock_alloc(struct super_block *sb) { if (blob_sizes.lbs_superblock == 0) { sb->s_security = NULL; return 0; } sb->s_security = kzalloc(blob_sizes.lbs_superblock, GFP_KERNEL); if (sb->s_security == NULL) return -ENOMEM; return 0; } /* * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and * can be accessed with: * * LSM_RET_DEFAULT(<hook_name>) * * The macros below define static constants for the default value of each * LSM hook. */ #define LSM_RET_DEFAULT(NAME) (NAME##_default) #define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME) #define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \ static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT); #define LSM_HOOK(RET, DEFAULT, NAME, ...) \ DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME) #include <linux/lsm_hook_defs.h> #undef LSM_HOOK /* * Hook list operation macros. * * call_void_hook: * This is a hook that does not return a value. * * call_int_hook: * This is a hook that returns a value. */ #define call_void_hook(FUNC, ...) \ do { \ struct security_hook_list *P; \ \ hlist_for_each_entry(P, &security_hook_heads.FUNC, list) \ P->hook.FUNC(__VA_ARGS__); \ } while (0) #define call_int_hook(FUNC, IRC, ...) ({ \ int RC = IRC; \ do { \ struct security_hook_list *P; \ \ hlist_for_each_entry(P, &security_hook_heads.FUNC, list) { \ RC = P->hook.FUNC(__VA_ARGS__); \ if (RC != 0) \ break; \ } \ } while (0); \ RC; \ }) /* Security operations */ /** * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok * @mgr: task credentials of current binder process * * Check whether @mgr is allowed to be the binder context manager. * * Return: Return 0 if permission is granted. */ int security_binder_set_context_mgr(const struct cred *mgr) { return call_int_hook(binder_set_context_mgr, 0, mgr); } /** * security_binder_transaction() - Check if a binder transaction is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to invoke a binder transaction call to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transaction(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transaction, 0, from, to); } /** * security_binder_transfer_binder() - Check if a binder transfer is allowed * @from: sending process * @to: receiving process * * Check whether @from is allowed to transfer a binder reference to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transfer_binder(const struct cred *from, const struct cred *to) { return call_int_hook(binder_transfer_binder, 0, from, to); } /** * security_binder_transfer_file() - Check if a binder file xfer is allowed * @from: sending process * @to: receiving process * @file: file being transferred * * Check whether @from is allowed to transfer @file to @to. * * Return: Returns 0 if permission is granted. */ int security_binder_transfer_file(const struct cred *from, const struct cred *to, const struct file *file) { return call_int_hook(binder_transfer_file, 0, from, to, file); } /** * security_ptrace_access_check() - Check if tracing is allowed * @child: target process * @mode: PTRACE_MODE flags * * Check permission before allowing the current process to trace the @child * process. Security modules may also want to perform a process tracing check * during an execve in the set_security or apply_creds hooks of tracing check * during an execve in the bprm_set_creds hook of binprm_security_ops if the * process is being traced and its security attributes would be changed by the * execve. * * Return: Returns 0 if permission is granted. */ int security_ptrace_access_check(struct task_struct *child, unsigned int mode) { return call_int_hook(ptrace_access_check, 0, child, mode); } /** * security_ptrace_traceme() - Check if tracing is allowed * @parent: tracing process * * Check that the @parent process has sufficient permission to trace the * current process before allowing the current process to present itself to the * @parent process for tracing. * * Return: Returns 0 if permission is granted. */ int security_ptrace_traceme(struct task_struct *parent) { return call_int_hook(ptrace_traceme, 0, parent); } /** * security_capget() - Get the capability sets for a process * @target: target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Get the @effective, @inheritable, and @permitted capability sets for the * @target process. The hook may also perform permission checking to determine * if the current process is allowed to see the capability sets of the @target * process. * * Return: Returns 0 if the capability sets were successfully obtained. */ int security_capget(const struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { return call_int_hook(capget, 0, target, effective, inheritable, permitted); } /** * security_capset() - Set the capability sets for a process * @new: new credentials for the target process * @old: current credentials of the target process * @effective: effective capability set * @inheritable: inheritable capability set * @permitted: permitted capability set * * Set the @effective, @inheritable, and @permitted capability sets for the * current process. * * Return: Returns 0 and update @new if permission is granted. */ int security_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { return call_int_hook(capset, 0, new, old, effective, inheritable, permitted); } /** * security_capable() - Check if a process has the necessary capability * @cred: credentials to examine * @ns: user namespace * @cap: capability requested * @opts: capability check options * * Check whether the @tsk process has the @cap capability in the indicated * credentials. @cap contains the capability <include/linux/capability.h>. * @opts contains options for the capable check <include/linux/security.h>. * * Return: Returns 0 if the capability is granted. */ int security_capable(const struct cred *cred, struct user_namespace *ns, int cap, unsigned int opts) { return call_int_hook(capable, 0, cred, ns, cap, opts); } /** * security_quotactl() - Check if a quotactl() syscall is allowed for this fs * @cmds: commands * @type: type * @id: id * @sb: filesystem * * Check whether the quotactl syscall is allowed for this @sb. * * Return: Returns 0 if permission is granted. */ int security_quotactl(int cmds, int type, int id, struct super_block *sb) { return call_int_hook(quotactl, 0, cmds, type, id, sb); } /** * security_quota_on() - Check if QUOTAON is allowed for a dentry * @dentry: dentry * * Check whether QUOTAON is allowed for @dentry. * * Return: Returns 0 if permission is granted. */ int security_quota_on(struct dentry *dentry) { return call_int_hook(quota_on, 0, dentry); } /** * security_syslog() - Check if accessing the kernel message ring is allowed * @type: SYSLOG_ACTION_* type * * Check permission before accessing the kernel message ring or changing * logging to the console. See the syslog(2) manual page for an explanation of * the @type values. * * Return: Return 0 if permission is granted. */ int security_syslog(int type) { return call_int_hook(syslog, 0, type); } /** * security_settime64() - Check if changing the system time is allowed * @ts: new time * @tz: timezone * * Check permission to change the system time, struct timespec64 is defined in * <include/linux/time64.h> and timezone is defined in <include/linux/time.h>. * * Return: Returns 0 if permission is granted. */ int security_settime64(const struct timespec64 *ts, const struct timezone *tz) { return call_int_hook(settime, 0, ts, tz); } /** * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed * @mm: mm struct * @pages: number of pages * * Check permissions for allocating a new virtual mapping. If all LSMs return * a positive value, __vm_enough_memory() will be called with cap_sys_admin * set. If at least one LSM returns 0 or negative, __vm_enough_memory() will be * called with cap_sys_admin cleared. * * Return: Returns 0 if permission is granted by the LSM infrastructure to the * caller. */ int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { struct security_hook_list *hp; int cap_sys_admin = 1; int rc; /* * The module will respond with a positive value if * it thinks the __vm_enough_memory() call should be * made with the cap_sys_admin set. If all of the modules * agree that it should be set it will. If any module * thinks it should not be set it won't. */ hlist_for_each_entry(hp, &security_hook_heads.vm_enough_memory, list) { rc = hp->hook.vm_enough_memory(mm, pages); if (rc <= 0) { cap_sys_admin = 0; break; } } return __vm_enough_memory(mm, pages, cap_sys_admin); } /** * security_bprm_creds_for_exec() - Prepare the credentials for exec() * @bprm: binary program information * * If the setup in prepare_exec_creds did not setup @bprm->cred->security * properly for executing @bprm->file, update the LSM's portion of * @bprm->cred->security to be what commit_creds needs to install for the new * program. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. @bprm * contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_for_exec(struct linux_binprm *bprm) { return call_int_hook(bprm_creds_for_exec, 0, bprm); } /** * security_bprm_creds_from_file() - Update linux_binprm creds based on file * @bprm: binary program information * @file: associated file * * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon * exec, update @bprm->cred to reflect that change. This is called after * finding the binary that will be executed without an interpreter. This * ensures that the credentials will not be derived from a script that the * binary will need to reopen, which when reopend may end up being a completely * different file. This hook may also optionally check permissions (e.g. for * transitions between security domains). The hook must set @bprm->secureexec * to 1 if AT_SECURE should be set to request libc enable secure mode. The * hook must add to @bprm->per_clear any personality flags that should be * cleared from current->personality. @bprm contains the linux_binprm * structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { return call_int_hook(bprm_creds_from_file, 0, bprm, file); } /** * security_bprm_check() - Mediate binary handler search * @bprm: binary program information * * This hook mediates the point when a search for a binary handler will begin. * It allows a check against the @bprm->cred->security value which was set in * the preceding creds_for_exec call. The argv list and envp list are reliably * available in @bprm. This hook may be called multiple times during a single * execve. @bprm contains the linux_binprm structure. * * Return: Returns 0 if the hook is successful and permission is granted. */ int security_bprm_check(struct linux_binprm *bprm) { int ret; ret = call_int_hook(bprm_check_security, 0, bprm); if (ret) return ret; return ima_bprm_check(bprm); } /** * security_bprm_committing_creds() - Install creds for a process during exec() * @bprm: binary program information * * Prepare to install the new security attributes of a process being * transformed by an execve operation, based on the old credentials pointed to * by @current->cred and the information set in @bprm->cred by the * bprm_creds_for_exec hook. @bprm points to the linux_binprm structure. This * hook is a good place to perform state changes on the process such as closing * open file descriptors to which access will no longer be granted when the * attributes are changed. This is called immediately before commit_creds(). */ void security_bprm_committing_creds(struct linux_binprm *bprm) { call_void_hook(bprm_committing_creds, bprm); } /** * security_bprm_committed_creds() - Tidy up after cred install during exec() * @bprm: binary program information * * Tidy up after the installation of the new security attributes of a process * being transformed by an execve operation. The new credentials have, by this * point, been set to @current->cred. @bprm points to the linux_binprm * structure. This hook is a good place to perform state changes on the * process such as clearing out non-inheritable signal state. This is called * immediately after commit_creds(). */ void security_bprm_committed_creds(struct linux_binprm *bprm) { call_void_hook(bprm_committed_creds, bprm); } /** * security_fs_context_submount() - Initialise fc->security * @fc: new filesystem context * @reference: dentry reference for submount/remount * * Fill out the ->security field for a new fs_context. * * Return: Returns 0 on success or negative error code on failure. */ int security_fs_context_submount(struct fs_context *fc, struct super_block *reference) { return call_int_hook(fs_context_submount, 0, fc, reference); } /** * security_fs_context_dup() - Duplicate a fs_context LSM blob * @fc: destination filesystem context * @src_fc: source filesystem context * * Allocate and attach a security structure to sc->security. This pointer is * initialised to NULL by the caller. @fc indicates the new filesystem context. * @src_fc indicates the original filesystem context. * * Return: Returns 0 on success or a negative error code on failure. */ int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { return call_int_hook(fs_context_dup, 0, fc, src_fc); } /** * security_fs_context_parse_param() - Configure a filesystem context * @fc: filesystem context * @param: filesystem parameter * * Userspace provided a parameter to configure a superblock. The LSM can * consume the parameter or return it to the caller for use elsewhere. * * Return: If the parameter is used by the LSM it should return 0, if it is * returned to the caller -ENOPARAM is returned, otherwise a negative * error code is returned. */ int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct security_hook_list *hp; int trc; int rc = -ENOPARAM; hlist_for_each_entry(hp, &security_hook_heads.fs_context_parse_param, list) { trc = hp->hook.fs_context_parse_param(fc, param); if (trc == 0) rc = 0; else if (trc != -ENOPARAM) return trc; } return rc; } /** * security_sb_alloc() - Allocate a super_block LSM blob * @sb: filesystem superblock * * Allocate and attach a security structure to the sb->s_security field. The * s_security field is initialized to NULL when the structure is allocated. * @sb contains the super_block structure to be modified. * * Return: Returns 0 if operation was successful. */ int security_sb_alloc(struct super_block *sb) { int rc = lsm_superblock_alloc(sb); if (unlikely(rc)) return rc; rc = call_int_hook(sb_alloc_security, 0, sb); if (unlikely(rc)) security_sb_free(sb); return rc; } /** * security_sb_delete() - Release super_block LSM associated objects * @sb: filesystem superblock * * Release objects tied to a superblock (e.g. inodes). @sb contains the * super_block structure being released. */ void security_sb_delete(struct super_block *sb) { call_void_hook(sb_delete, sb); } /** * security_sb_free() - Free a super_block LSM blob * @sb: filesystem superblock * * Deallocate and clear the sb->s_security field. @sb contains the super_block * structure to be modified. */ void security_sb_free(struct super_block *sb) { call_void_hook(sb_free_security, sb); kfree(sb->s_security); sb->s_security = NULL; } /** * security_free_mnt_opts() - Free memory associated with mount options * @mnt_opts: LSM processed mount options * * Free memory associated with @mnt_ops. */ void security_free_mnt_opts(void **mnt_opts) { if (!*mnt_opts) return; call_void_hook(sb_free_mnt_opts, *mnt_opts); *mnt_opts = NULL; } EXPORT_SYMBOL(security_free_mnt_opts); /** * security_sb_eat_lsm_opts() - Consume LSM mount options * @options: mount options * @mnt_opts: LSM processed mount options * * Eat (scan @options) and save them in @mnt_opts. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_eat_lsm_opts(char *options, void **mnt_opts) { return call_int_hook(sb_eat_lsm_opts, 0, options, mnt_opts); } EXPORT_SYMBOL(security_sb_eat_lsm_opts); /** * security_sb_mnt_opts_compat() - Check if new mount options are allowed * @sb: filesystem superblock * @mnt_opts: new mount options * * Determine if the new mount options in @mnt_opts are allowed given the * existing mounted filesystem at @sb. @sb superblock being compared. * * Return: Returns 0 if options are compatible. */ int security_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_mnt_opts_compat, 0, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_mnt_opts_compat); /** * security_sb_remount() - Verify no incompatible mount changes during remount * @sb: filesystem superblock * @mnt_opts: (re)mount options * * Extracts security system specific mount options and verifies no changes are * being made to those options. * * Return: Returns 0 if permission is granted. */ int security_sb_remount(struct super_block *sb, void *mnt_opts) { return call_int_hook(sb_remount, 0, sb, mnt_opts); } EXPORT_SYMBOL(security_sb_remount); /** * security_sb_kern_mount() - Check if a kernel mount is allowed * @sb: filesystem superblock * * Mount this @sb if allowed by permissions. * * Return: Returns 0 if permission is granted. */ int security_sb_kern_mount(struct super_block *sb) { return call_int_hook(sb_kern_mount, 0, sb); } /** * security_sb_show_options() - Output the mount options for a superblock * @m: output file * @sb: filesystem superblock * * Show (print on @m) mount options for this @sb. * * Return: Returns 0 on success, negative values on failure. */ int security_sb_show_options(struct seq_file *m, struct super_block *sb) { return call_int_hook(sb_show_options, 0, m, sb); } /** * security_sb_statfs() - Check if accessing fs stats is allowed * @dentry: superblock handle * * Check permission before obtaining filesystem statistics for the @mnt * mountpoint. @dentry is a handle on the superblock for the filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_statfs(struct dentry *dentry) { return call_int_hook(sb_statfs, 0, dentry); } /** * security_sb_mount() - Check permission for mounting a filesystem * @dev_name: filesystem backing device * @path: mount point * @type: filesystem type * @flags: mount flags * @data: filesystem specific data * * Check permission before an object specified by @dev_name is mounted on the * mount point named by @nd. For an ordinary mount, @dev_name identifies a * device if the file system type requires a device. For a remount * (@flags & MS_REMOUNT), @dev_name is irrelevant. For a loopback/bind mount * (@flags & MS_BIND), @dev_name identifies the pathname of the object being * mounted. * * Return: Returns 0 if permission is granted. */ int security_sb_mount(const char *dev_name, const struct path *path, const char *type, unsigned long flags, void *data) { return call_int_hook(sb_mount, 0, dev_name, path, type, flags, data); } /** * security_sb_umount() - Check permission for unmounting a filesystem * @mnt: mounted filesystem * @flags: unmount flags * * Check permission before the @mnt file system is unmounted. * * Return: Returns 0 if permission is granted. */ int security_sb_umount(struct vfsmount *mnt, int flags) { return call_int_hook(sb_umount, 0, mnt, flags); } /** * security_sb_pivotroot() - Check permissions for pivoting the rootfs * @old_path: new location for current rootfs * @new_path: location of the new rootfs * * Check permission before pivoting the root filesystem. * * Return: Returns 0 if permission is granted. */ int security_sb_pivotroot(const struct path *old_path, const struct path *new_path) { return call_int_hook(sb_pivotroot, 0, old_path, new_path); } /** * security_sb_set_mnt_opts() - Set the mount options for a filesystem * @sb: filesystem superblock * @mnt_opts: binary mount options * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Set the security relevant mount options used for a superblock. * * Return: Returns 0 on success, error on failure. */ int security_sb_set_mnt_opts(struct super_block *sb, void *mnt_opts, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_set_mnt_opts, mnt_opts ? -EOPNOTSUPP : 0, sb, mnt_opts, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_set_mnt_opts); /** * security_sb_clone_mnt_opts() - Duplicate superblock mount options * @oldsb: source superblock * @newsb: destination superblock * @kern_flags: kernel flags (in) * @set_kern_flags: kernel flags (out) * * Copy all security options from a given superblock to another. * * Return: Returns 0 on success, error on failure. */ int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb, unsigned long kern_flags, unsigned long *set_kern_flags) { return call_int_hook(sb_clone_mnt_opts, 0, oldsb, newsb, kern_flags, set_kern_flags); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); /** * security_move_mount() - Check permissions for moving a mount * @from_path: source mount point * @to_path: destination mount point * * Check permission before a mount is moved. * * Return: Returns 0 if permission is granted. */ int security_move_mount(const struct path *from_path, const struct path *to_path) { return call_int_hook(move_mount, 0, from_path, to_path); } /** * security_path_notify() - Check if setting a watch is allowed * @path: file path * @mask: event mask * @obj_type: file path type * * Check permissions before setting a watch on events as defined by @mask, on * an object at @path, whose type is defined by @obj_type. * * Return: Returns 0 if permission is granted. */ int security_path_notify(const struct path *path, u64 mask, unsigned int obj_type) { return call_int_hook(path_notify, 0, path, mask, obj_type); } /** * security_inode_alloc() - Allocate an inode LSM blob * @inode: the inode * * Allocate and attach a security structure to @inode->i_security. The * i_security field is initialized to NULL when the inode structure is * allocated. * * Return: Return 0 if operation was successful. */ int security_inode_alloc(struct inode *inode) { int rc = lsm_inode_alloc(inode); if (unlikely(rc)) return rc; rc = call_int_hook(inode_alloc_security, 0, inode); if (unlikely(rc)) security_inode_free(inode); return rc; } static void inode_free_by_rcu(struct rcu_head *head) { /* * The rcu head is at the start of the inode blob */ kmem_cache_free(lsm_inode_cache, head); } /** * security_inode_free() - Free an inode's LSM blob * @inode: the inode * * Deallocate the inode security structure and set @inode->i_security to NULL. */ void security_inode_free(struct inode *inode) { integrity_inode_free(inode); call_void_hook(inode_free_security, inode); /* * The inode may still be referenced in a path walk and * a call to security_inode_permission() can be made * after inode_free_security() is called. Ideally, the VFS * wouldn't do this, but fixing that is a much harder * job. For now, simply free the i_security via RCU, and * leave the current inode->i_security pointer intact. * The inode will be freed after the RCU grace period too. */ if (inode->i_security) call_rcu((struct rcu_head *)inode->i_security, inode_free_by_rcu); } /** * security_dentry_init_security() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @xattr_name: name of the security/LSM xattr * @ctx: pointer to the resulting LSM context * @ctxlen: length of @ctx * * Compute a context for a dentry as the inode is not yet available since NFSv4 * has no label backed by an EA anyway. It is important to note that * @xattr_name does not need to be free'd by the caller, it is a static string. * * Return: Returns 0 on success, negative values on failure. */ int security_dentry_init_security(struct dentry *dentry, int mode, const struct qstr *name, const char **xattr_name, void **ctx, u32 *ctxlen) { struct security_hook_list *hp; int rc; /* * Only one module will provide a security context. */ hlist_for_each_entry(hp, &security_hook_heads.dentry_init_security, list) { rc = hp->hook.dentry_init_security(dentry, mode, name, xattr_name, ctx, ctxlen); if (rc != LSM_RET_DEFAULT(dentry_init_security)) return rc; } return LSM_RET_DEFAULT(dentry_init_security); } EXPORT_SYMBOL(security_dentry_init_security); /** * security_dentry_create_files_as() - Perform dentry initialization * @dentry: the dentry to initialize * @mode: mode used to determine resource type * @name: name of the last path component * @old: creds to use for LSM context calculations * @new: creds to modify * * Compute a context for a dentry as the inode is not yet available and set * that context in passed in creds so that new files are created using that * context. Context is calculated using the passed in creds and not the creds * of the caller. * * Return: Returns 0 on success, error on failure. */ int security_dentry_create_files_as(struct dentry *dentry, int mode, struct qstr *name, const struct cred *old, struct cred *new) { return call_int_hook(dentry_create_files_as, 0, dentry, mode, name, old, new); } EXPORT_SYMBOL(security_dentry_create_files_as); /** * security_inode_init_security() - Initialize an inode's LSM context * @inode: the inode * @dir: parent directory * @qstr: last component of the pathname * @initxattrs: callback function to write xattrs * @fs_data: filesystem specific data * * Obtain the security attribute name suffix and value to set on a newly * created inode and set up the incore security field for the new inode. This * hook is called by the fs code as part of the inode creation transaction and * provides for atomic labeling of the inode, unlike the post_create/mkdir/... * hooks called by the VFS. * * The hook function is expected to populate the xattrs array, by calling * lsm_get_xattr_slot() to retrieve the slots reserved by the security module * with the lbs_xattr_count field of the lsm_blob_sizes structure. For each * slot, the hook function should set ->name to the attribute name suffix * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it * to the attribute value, to set ->value_len to the length of the value. If * the security module does not use security attributes or does not wish to put * a security attribute on this particular inode, then it should return * -EOPNOTSUPP to skip this processing. * * Return: Returns 0 if the LSM successfully initialized all of the inode * security attributes that are required, negative values otherwise. */ int security_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr, const initxattrs initxattrs, void *fs_data) { struct security_hook_list *hp; struct xattr *new_xattrs = NULL; int ret = -EOPNOTSUPP, xattr_count = 0; if (unlikely(IS_PRIVATE(inode))) return 0; if (!blob_sizes.lbs_xattr_count) return 0; if (initxattrs) { /* Allocate +1 for EVM and +1 as terminator. */ new_xattrs = kcalloc(blob_sizes.lbs_xattr_count + 2, sizeof(*new_xattrs), GFP_NOFS); if (!new_xattrs) return -ENOMEM; } hlist_for_each_entry(hp, &security_hook_heads.inode_init_security, list) { ret = hp->hook.inode_init_security(inode, dir, qstr, new_xattrs, &xattr_count); if (ret && ret != -EOPNOTSUPP) goto out; /* * As documented in lsm_hooks.h, -EOPNOTSUPP in this context * means that the LSM is not willing to provide an xattr, not * that it wants to signal an error. Thus, continue to invoke * the remaining LSMs. */ } /* If initxattrs() is NULL, xattr_count is zero, skip the call. */ if (!xattr_count) goto out; ret = evm_inode_init_security(inode, dir, qstr, new_xattrs, &xattr_count); if (ret) goto out; ret = initxattrs(inode, new_xattrs, fs_data); out: for (; xattr_count > 0; xattr_count--) kfree(new_xattrs[xattr_count - 1].value); kfree(new_xattrs); return (ret == -EOPNOTSUPP) ? 0 : ret; } EXPORT_SYMBOL(security_inode_init_security); /** * security_inode_init_security_anon() - Initialize an anonymous inode * @inode: the inode * @name: the anonymous inode class * @context_inode: an optional related inode * * Set up the incore security field for the new anonymous inode and return * whether the inode creation is permitted by the security module or not. * * Return: Returns 0 on success, -EACCES if the security module denies the * creation of this inode, or another -errno upon other errors. */ int security_inode_init_security_anon(struct inode *inode, const struct qstr *name, const struct inode *context_inode) { return call_int_hook(inode_init_security_anon, 0, inode, name, context_inode); } #ifdef CONFIG_SECURITY_PATH /** * security_path_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a file. Note that this hook is called even * if mknod operation is being done for a regular file. * * Return: Returns 0 if permission is granted. */ int security_path_mknod(const struct path *dir, struct dentry *dentry, umode_t mode, unsigned int dev) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mknod, 0, dir, dentry, mode, dev); } EXPORT_SYMBOL(security_path_mknod); /** * security_path_mkdir() - Check if creating a new directory is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory. * * Return: Returns 0 if permission is granted. */ int security_path_mkdir(const struct path *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_mkdir, 0, dir, dentry, mode); } EXPORT_SYMBOL(security_path_mkdir); /** * security_path_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to remove * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_path_rmdir(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_rmdir, 0, dir, dentry); } /** * security_path_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_unlink(const struct path *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_unlink, 0, dir, dentry); } EXPORT_SYMBOL(security_path_unlink); /** * security_path_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: file pathname * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_symlink(const struct path *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dir->dentry)))) return 0; return call_int_hook(path_symlink, 0, dir, dentry, old_name); } /** * security_path_link - Check if creating a hard link is allowed * @old_dentry: existing file * @new_dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_path_link(struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry); } /** * security_path_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_rename(const struct path *old_dir, struct dentry *old_dentry, const struct path *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; return call_int_hook(path_rename, 0, old_dir, old_dentry, new_dir, new_dentry, flags); } EXPORT_SYMBOL(security_path_rename); /** * security_path_truncate() - Check if truncating a file is allowed * @path: file * * Check permission before truncating the file indicated by path. Note that * truncation permissions may also be checked based on already opened files, * using the security_file_truncate() hook. * * Return: Returns 0 if permission is granted. */ int security_path_truncate(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_truncate, 0, path); } /** * security_path_chmod() - Check if changing the file's mode is allowed * @path: file * @mode: new mode * * Check for permission to change a mode of the file @path. The new mode is * specified in @mode which is a bitmask of constants from * <include/uapi/linux/stat.h>. * * Return: Returns 0 if permission is granted. */ int security_path_chmod(const struct path *path, umode_t mode) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chmod, 0, path, mode); } /** * security_path_chown() - Check if changing the file's owner/group is allowed * @path: file * @uid: file owner * @gid: file group * * Check for permission to change owner/group of a file or directory. * * Return: Returns 0 if permission is granted. */ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(path_chown, 0, path, uid, gid); } /** * security_path_chroot() - Check if changing the root directory is allowed * @path: directory * * Check for permission to change root directory. * * Return: Returns 0 if permission is granted. */ int security_path_chroot(const struct path *path) { return call_int_hook(path_chroot, 0, path); } #endif /* CONFIG_SECURITY_PATH */ /** * security_inode_create() - Check if creating a file is allowed * @dir: the parent directory * @dentry: the file being created * @mode: requested file mode * * Check permission to create a regular file. * * Return: Returns 0 if permission is granted. */ int security_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_create, 0, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_create); /** * security_inode_link() - Check if creating a hard link is allowed * @old_dentry: existing file * @dir: new parent directory * @new_dentry: new link * * Check permission before creating a new hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)))) return 0; return call_int_hook(inode_link, 0, old_dentry, dir, new_dentry); } /** * security_inode_unlink() - Check if removing a hard link is allowed * @dir: parent directory * @dentry: file * * Check the permission to remove a hard link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_unlink(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_unlink, 0, dir, dentry); } /** * security_inode_symlink() - Check if creating a symbolic link is allowed * @dir: parent directory * @dentry: symbolic link * @old_name: existing filename * * Check the permission to create a symbolic link to a file. * * Return: Returns 0 if permission is granted. */ int security_inode_symlink(struct inode *dir, struct dentry *dentry, const char *old_name) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_symlink, 0, dir, dentry, old_name); } /** * security_inode_mkdir() - Check if creation a new director is allowed * @dir: parent directory * @dentry: new directory * @mode: new directory mode * * Check permissions to create a new directory in the existing directory * associated with inode structure @dir. * * Return: Returns 0 if permission is granted. */ int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mkdir, 0, dir, dentry, mode); } EXPORT_SYMBOL_GPL(security_inode_mkdir); /** * security_inode_rmdir() - Check if removing a directory is allowed * @dir: parent directory * @dentry: directory to be removed * * Check the permission to remove a directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rmdir(struct inode *dir, struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_rmdir, 0, dir, dentry); } /** * security_inode_mknod() - Check if creating a special file is allowed * @dir: parent directory * @dentry: new file * @mode: new file mode * @dev: device number * * Check permissions when creating a special file (or a socket or a fifo file * created via the mknod system call). Note that if mknod operation is being * done for a regular file, then the create hook will be called and not this * hook. * * Return: Returns 0 if permission is granted. */ int security_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { if (unlikely(IS_PRIVATE(dir))) return 0; return call_int_hook(inode_mknod, 0, dir, dentry, mode, dev); } /** * security_inode_rename() - Check if renaming a file is allowed * @old_dir: parent directory of the old file * @old_dentry: the old file * @new_dir: parent directory of the new file * @new_dentry: the new file * @flags: flags * * Check for permission to rename a file or directory. * * Return: Returns 0 if permission is granted. */ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry)) || (d_is_positive(new_dentry) && IS_PRIVATE(d_backing_inode(new_dentry))))) return 0; if (flags & RENAME_EXCHANGE) { int err = call_int_hook(inode_rename, 0, new_dir, new_dentry, old_dir, old_dentry); if (err) return err; } return call_int_hook(inode_rename, 0, old_dir, old_dentry, new_dir, new_dentry); } /** * security_inode_readlink() - Check if reading a symbolic link is allowed * @dentry: link * * Check the permission to read the symbolic link. * * Return: Returns 0 if permission is granted. */ int security_inode_readlink(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_readlink, 0, dentry); } /** * security_inode_follow_link() - Check if following a symbolic link is allowed * @dentry: link dentry * @inode: link inode * @rcu: true if in RCU-walk mode * * Check permission to follow a symbolic link when looking up a pathname. If * @rcu is true, @inode is not stable. * * Return: Returns 0 if permission is granted. */ int security_inode_follow_link(struct dentry *dentry, struct inode *inode, bool rcu) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_follow_link, 0, dentry, inode, rcu); } /** * security_inode_permission() - Check if accessing an inode is allowed * @inode: inode * @mask: access mask * * Check permission before accessing an inode. This hook is called by the * existing Linux permission function, so a security module can use it to * provide additional checking for existing Linux permission checks. Notice * that this hook is called when a file is opened (as well as many other * operations), whereas the file_security_ops permission hook is called when * the actual read/write operations are performed. * * Return: Returns 0 if permission is granted. */ int security_inode_permission(struct inode *inode, int mask) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_permission, 0, inode, mask); } /** * security_inode_setattr() - Check if setting file attributes is allowed * @idmap: idmap of the mount * @dentry: file * @attr: new attributes * * Check permission before setting file attributes. Note that the kernel call * to notify_change is performed from several locations, whenever file * attributes change (such as when a file is truncated, chown/chmod operations, * transferring disk quotas, etc). * * Return: Returns 0 if permission is granted. */ int security_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = call_int_hook(inode_setattr, 0, dentry, attr); if (ret) return ret; return evm_inode_setattr(idmap, dentry, attr); } EXPORT_SYMBOL_GPL(security_inode_setattr); /** * security_inode_getattr() - Check if getting file attributes is allowed * @path: file * * Check permission before obtaining file attributes. * * Return: Returns 0 if permission is granted. */ int security_inode_getattr(const struct path *path) { if (unlikely(IS_PRIVATE(d_backing_inode(path->dentry)))) return 0; return call_int_hook(inode_getattr, 0, path); } /** * security_inode_setxattr() - Check if setting file xattrs is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * @value: xattr value * @size: size of xattr value * @flags: flags * * Check permission before setting the extended attributes. * * Return: Returns 0 if permission is granted. */ int security_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. */ ret = call_int_hook(inode_setxattr, 1, idmap, dentry, name, value, size, flags); if (ret == 1) ret = cap_inode_setxattr(dentry, name, value, size, flags); if (ret) return ret; ret = ima_inode_setxattr(dentry, name, value, size); if (ret) return ret; return evm_inode_setxattr(idmap, dentry, name, value, size); } /** * security_inode_set_acl() - Check if setting posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * @kacl: acl struct * * Check permission before setting posix acls, the posix acls in @kacl are * identified by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = call_int_hook(inode_set_acl, 0, idmap, dentry, acl_name, kacl); if (ret) return ret; ret = ima_inode_set_acl(idmap, dentry, acl_name, kacl); if (ret) return ret; return evm_inode_set_acl(idmap, dentry, acl_name, kacl); } /** * security_inode_get_acl() - Check if reading posix acls is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before getting osix acls, the posix acls are identified by * @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_get_acl, 0, idmap, dentry, acl_name); } /** * security_inode_remove_acl() - Check if removing a posix acl is allowed * @idmap: idmap of the mount * @dentry: file * @acl_name: acl name * * Check permission before removing posix acls, the posix acls are identified * by @acl_name. * * Return: Returns 0 if permission is granted. */ int security_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; ret = call_int_hook(inode_remove_acl, 0, idmap, dentry, acl_name); if (ret) return ret; ret = ima_inode_remove_acl(idmap, dentry, acl_name); if (ret) return ret; return evm_inode_remove_acl(idmap, dentry, acl_name); } /** * security_inode_post_setxattr() - Update the inode after a setxattr operation * @dentry: file * @name: xattr name * @value: xattr value * @size: xattr value size * @flags: flags * * Update inode security field after successful setxattr operation. */ void security_inode_post_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return; call_void_hook(inode_post_setxattr, dentry, name, value, size, flags); evm_inode_post_setxattr(dentry, name, value, size); } /** * security_inode_getxattr() - Check if xattr access is allowed * @dentry: file * @name: xattr name * * Check permission before obtaining the extended attributes identified by * @name for @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_getxattr(struct dentry *dentry, const char *name) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_getxattr, 0, dentry, name); } /** * security_inode_listxattr() - Check if listing xattrs is allowed * @dentry: file * * Check permission before obtaining the list of extended attribute names for * @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_listxattr(struct dentry *dentry) { if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; return call_int_hook(inode_listxattr, 0, dentry); } /** * security_inode_removexattr() - Check if removing an xattr is allowed * @idmap: idmap of the mount * @dentry: file * @name: xattr name * * Check permission before removing the extended attribute identified by @name * for @dentry. * * Return: Returns 0 if permission is granted. */ int security_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { int ret; if (unlikely(IS_PRIVATE(d_backing_inode(dentry)))) return 0; /* * SELinux and Smack integrate the cap call, * so assume that all LSMs supplying this call do so. */ ret = call_int_hook(inode_removexattr, 1, idmap, dentry, name); if (ret == 1) ret = cap_inode_removexattr(idmap, dentry, name); if (ret) return ret; ret = ima_inode_removexattr(dentry, name); if (ret) return ret; return evm_inode_removexattr(idmap, dentry, name); } /** * security_inode_need_killpriv() - Check if security_inode_killpriv() required * @dentry: associated dentry * * Called when an inode has been changed to determine if * security_inode_killpriv() should be called. * * Return: Return <0 on error to abort the inode change operation, return 0 if * security_inode_killpriv() does not need to be called, return >0 if * security_inode_killpriv() does need to be called. */ int security_inode_need_killpriv(struct dentry *dentry) { return call_int_hook(inode_need_killpriv, 0, dentry); } /** * security_inode_killpriv() - The setuid bit is removed, update LSM state * @idmap: idmap of the mount * @dentry: associated dentry * * The @dentry's setuid bit is being removed. Remove similar security labels. * Called with the dentry->d_inode->i_mutex held. * * Return: Return 0 on success. If error is returned, then the operation * causing setuid bit removal is failed. */ int security_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry) { return call_int_hook(inode_killpriv, 0, idmap, dentry); } /** * security_inode_getsecurity() - Get the xattr security label of an inode * @idmap: idmap of the mount * @inode: inode * @name: xattr name * @buffer: security label buffer * @alloc: allocation flag * * Retrieve a copy of the extended attribute representation of the security * label associated with @name for @inode via @buffer. Note that @name is the * remainder of the attribute name after the security prefix has been removed. * @alloc is used to specify if the call should return a value via the buffer * or just the value length. * * Return: Returns size of buffer on success. */ int security_inode_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void **buffer, bool alloc) { struct security_hook_list *hp; int rc; if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_getsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_getsecurity, list) { rc = hp->hook.inode_getsecurity(idmap, inode, name, buffer, alloc); if (rc != LSM_RET_DEFAULT(inode_getsecurity)) return rc; } return LSM_RET_DEFAULT(inode_getsecurity); } /** * security_inode_setsecurity() - Set the xattr security label of an inode * @inode: inode * @name: xattr name * @value: security label * @size: length of security label * @flags: flags * * Set the security label associated with @name for @inode from the extended * attribute value @value. @size indicates the size of the @value in bytes. * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the * remainder of the attribute name after the security. prefix has been removed. * * Return: Returns 0 on success. */ int security_inode_setsecurity(struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct security_hook_list *hp; int rc; if (unlikely(IS_PRIVATE(inode))) return LSM_RET_DEFAULT(inode_setsecurity); /* * Only one module will provide an attribute with a given name. */ hlist_for_each_entry(hp, &security_hook_heads.inode_setsecurity, list) { rc = hp->hook.inode_setsecurity(inode, name, value, size, flags); if (rc != LSM_RET_DEFAULT(inode_setsecurity)) return rc; } return LSM_RET_DEFAULT(inode_setsecurity); } /** * security_inode_listsecurity() - List the xattr security label names * @inode: inode * @buffer: buffer * @buffer_size: size of buffer * * Copy the extended attribute names for the security labels associated with * @inode into @buffer. The maximum size of @buffer is specified by * @buffer_size. @buffer may be NULL to request the size of the buffer * required. * * Return: Returns number of bytes used/required on success. */ int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size) { if (unlikely(IS_PRIVATE(inode))) return 0; return call_int_hook(inode_listsecurity, 0, inode, buffer, buffer_size); } EXPORT_SYMBOL(security_inode_listsecurity); /** * security_inode_getsecid() - Get an inode's secid * @inode: inode * @secid: secid to return * * Get the secid associated with the node. In case of failure, @secid will be * set to zero. */ void security_inode_getsecid(struct inode *inode, u32 *secid) { call_void_hook(inode_getsecid, inode, secid); } /** * security_inode_copy_up() - Create new creds for an overlayfs copy-up op * @src: union dentry of copy-up file * @new: newly created creds * * A file is about to be copied up from lower layer to upper layer of overlay * filesystem. Security module can prepare a set of new creds and modify as * need be and return new creds. Caller will switch to new creds temporarily to * create new file and release newly allocated creds. * * Return: Returns 0 on success or a negative error code on error. */ int security_inode_copy_up(struct dentry *src, struct cred **new) { return call_int_hook(inode_copy_up, 0, src, new); } EXPORT_SYMBOL(security_inode_copy_up); /** * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op * @name: xattr name * * Filter the xattrs being copied up when a unioned file is copied up from a * lower layer to the union/overlay layer. The caller is responsible for * reading and writing the xattrs, this hook is merely a filter. * * Return: Returns 0 to accept the xattr, 1 to discard the xattr, -EOPNOTSUPP * if the security module does not know about attribute, or a negative * error code to abort the copy up. */ int security_inode_copy_up_xattr(const char *name) { struct security_hook_list *hp; int rc; /* * The implementation can return 0 (accept the xattr), 1 (discard the * xattr), -EOPNOTSUPP if it does not know anything about the xattr or * any other error code in case of an error. */ hlist_for_each_entry(hp, &security_hook_heads.inode_copy_up_xattr, list) { rc = hp->hook.inode_copy_up_xattr(name); if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr)) return rc; } return LSM_RET_DEFAULT(inode_copy_up_xattr); } EXPORT_SYMBOL(security_inode_copy_up_xattr); /** * security_kernfs_init_security() - Init LSM context for a kernfs node * @kn_dir: parent kernfs node * @kn: the kernfs node to initialize * * Initialize the security context of a newly created kernfs node based on its * own and its parent's attributes. * * Return: Returns 0 if permission is granted. */ int security_kernfs_init_security(struct kernfs_node *kn_dir, struct kernfs_node *kn) { return call_int_hook(kernfs_init_security, 0, kn_dir, kn); } /** * security_file_permission() - Check file permissions * @file: file * @mask: requested permissions * * Check file permissions before accessing an open file. This hook is called * by various operations that read or write files. A security module can use * this hook to perform additional checking on these operations, e.g. to * revalidate permissions on use to support privilege bracketing or policy * changes. Notice that this hook is used when the actual read/write * operations are performed, whereas the inode_security_ops hook is called when * a file is opened (as well as many other operations). Although this hook can * be used to revalidate permissions for various system call operations that * read or write files, it does not address the revalidation of permissions for * memory-mapped files. Security modules must handle this separately if they * need such revalidation. * * Return: Returns 0 if permission is granted. */ int security_file_permission(struct file *file, int mask) { int ret; ret = call_int_hook(file_permission, 0, file, mask); if (ret) return ret; return fsnotify_perm(file, mask); } /** * security_file_alloc() - Allocate and init a file's LSM blob * @file: the file * * Allocate and attach a security structure to the file->f_security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if the hook is successful and permission is granted. */ int security_file_alloc(struct file *file) { int rc = lsm_file_alloc(file); if (rc) return rc; rc = call_int_hook(file_alloc_security, 0, file); if (unlikely(rc)) security_file_free(file); return rc; } /** * security_file_free() - Free a file's LSM blob * @file: the file * * Deallocate and free any security structures stored in file->f_security. */ void security_file_free(struct file *file) { void *blob; call_void_hook(file_free_security, file); blob = file->f_security; if (blob) { file->f_security = NULL; kmem_cache_free(lsm_file_cache, blob); } } /** * security_file_ioctl() - Check if an ioctl is allowed * @file: associated file * @cmd: ioctl cmd * @arg: ioctl arguments * * Check permission for an ioctl operation on @file. Note that @arg sometimes * represents a user space pointer; in other cases, it may be a simple integer * value. When @arg represents a user space pointer, it should never be used * by the security module. * * Return: Returns 0 if permission is granted. */ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_ioctl, 0, file, cmd, arg); } EXPORT_SYMBOL_GPL(security_file_ioctl); static inline unsigned long mmap_prot(struct file *file, unsigned long prot) { /* * Does we have PROT_READ and does the application expect * it to imply PROT_EXEC? If not, nothing to talk about... */ if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ) return prot; if (!(current->personality & READ_IMPLIES_EXEC)) return prot; /* * if that's an anonymous mapping, let it. */ if (!file) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!path_noexec(&file->f_path)) { #ifndef CONFIG_MMU if (file->f_op->mmap_capabilities) { unsigned caps = file->f_op->mmap_capabilities(file); if (!(caps & NOMMU_MAP_EXEC)) return prot; } #endif return prot | PROT_EXEC; } /* anything on noexec mount won't get PROT_EXEC */ return prot; } /** * security_mmap_file() - Check if mmap'ing a file is allowed * @file: file * @prot: protection applied by the kernel * @flags: flags * * Check permissions for a mmap operation. The @file may be NULL, e.g. if * mapping anonymous memory. * * Return: Returns 0 if permission is granted. */ int security_mmap_file(struct file *file, unsigned long prot, unsigned long flags) { unsigned long prot_adj = mmap_prot(file, prot); int ret; ret = call_int_hook(mmap_file, 0, file, prot, prot_adj, flags); if (ret) return ret; return ima_file_mmap(file, prot, prot_adj, flags); } /** * security_mmap_addr() - Check if mmap'ing an address is allowed * @addr: address * * Check permissions for a mmap operation at @addr. * * Return: Returns 0 if permission is granted. */ int security_mmap_addr(unsigned long addr) { return call_int_hook(mmap_addr, 0, addr); } /** * security_file_mprotect() - Check if changing memory protections is allowed * @vma: memory region * @reqprot: application requested protection * @prot: protection applied by the kernel * * Check permissions before changing memory access permissions. * * Return: Returns 0 if permission is granted. */ int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { int ret; ret = call_int_hook(file_mprotect, 0, vma, reqprot, prot); if (ret) return ret; return ima_file_mprotect(vma, prot); } /** * security_file_lock() - Check if a file lock is allowed * @file: file * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK) * * Check permission before performing file locking operations. Note the hook * mediates both flock and fcntl style locks. * * Return: Returns 0 if permission is granted. */ int security_file_lock(struct file *file, unsigned int cmd) { return call_int_hook(file_lock, 0, file, cmd); } /** * security_file_fcntl() - Check if fcntl() op is allowed * @file: file * @cmd: fcntl command * @arg: command argument * * Check permission before allowing the file operation specified by @cmd from * being performed on the file @file. Note that @arg sometimes represents a * user space pointer; in other cases, it may be a simple integer value. When * @arg represents a user space pointer, it should never be used by the * security module. * * Return: Returns 0 if permission is granted. */ int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { return call_int_hook(file_fcntl, 0, file, cmd, arg); } /** * security_file_set_fowner() - Set the file owner info in the LSM blob * @file: the file * * Save owner security information (typically from current->security) in * file->f_security for later use by the send_sigiotask hook. * * Return: Returns 0 on success. */ void security_file_set_fowner(struct file *file) { call_void_hook(file_set_fowner, file); } /** * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed * @tsk: target task * @fown: signal sender * @sig: signal to be sent, SIGIO is sent if 0 * * Check permission for the file owner @fown to send SIGIO or SIGURG to the * process @tsk. Note that this hook is sometimes called from interrupt. Note * that the fown_struct, @fown, is never outside the context of a struct file, * so the file structure (and associated security information) can always be * obtained: container_of(fown, struct file, f_owner). * * Return: Returns 0 if permission is granted. */ int security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig) { return call_int_hook(file_send_sigiotask, 0, tsk, fown, sig); } /** * security_file_receive() - Check is receiving a file via IPC is allowed * @file: file being received * * This hook allows security modules to control the ability of a process to * receive an open file descriptor via socket IPC. * * Return: Returns 0 if permission is granted. */ int security_file_receive(struct file *file) { return call_int_hook(file_receive, 0, file); } /** * security_file_open() - Save open() time state for late use by the LSM * @file: * * Save open-time permission checking state for later use upon file_permission, * and recheck access if anything has changed since inode_permission. * * Return: Returns 0 if permission is granted. */ int security_file_open(struct file *file) { int ret; ret = call_int_hook(file_open, 0, file); if (ret) return ret; return fsnotify_perm(file, MAY_OPEN); } /** * security_file_truncate() - Check if truncating a file is allowed * @file: file * * Check permission before truncating a file, i.e. using ftruncate. Note that * truncation permission may also be checked based on the path, using the * @path_truncate hook. * * Return: Returns 0 if permission is granted. */ int security_file_truncate(struct file *file) { return call_int_hook(file_truncate, 0, file); } /** * security_task_alloc() - Allocate a task's LSM blob * @task: the task * @clone_flags: flags indicating what is being shared * * Handle allocation of task-related resources. * * Return: Returns a zero on success, negative values on failure. */ int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { int rc = lsm_task_alloc(task); if (rc) return rc; rc = call_int_hook(task_alloc, 0, task, clone_flags); if (unlikely(rc)) security_task_free(task); return rc; } /** * security_task_free() - Free a task's LSM blob and related resources * @task: task * * Handle release of task-related resources. Note that this can be called from * interrupt context. */ void security_task_free(struct task_struct *task) { call_void_hook(task_free, task); kfree(task->security); task->security = NULL; } /** * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer * @cred: credentials * @gfp: gfp flags * * Only allocate sufficient memory and attach to @cred such that * cred_transfer() will not get ENOMEM. * * Return: Returns 0 on success, negative values on failure. */ int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { int rc = lsm_cred_alloc(cred, gfp); if (rc) return rc; rc = call_int_hook(cred_alloc_blank, 0, cred, gfp); if (unlikely(rc)) security_cred_free(cred); return rc; } /** * security_cred_free() - Free the cred's LSM blob and associated resources * @cred: credentials * * Deallocate and clear the cred->security field in a set of credentials. */ void security_cred_free(struct cred *cred) { /* * There is a failure case in prepare_creds() that * may result in a call here with ->security being NULL. */ if (unlikely(cred->security == NULL)) return; call_void_hook(cred_free, cred); kfree(cred->security); cred->security = NULL; } /** * security_prepare_creds() - Prepare a new set of credentials * @new: new credentials * @old: original credentials * @gfp: gfp flags * * Prepare a new set of credentials by copying the data from the old set. * * Return: Returns 0 on success, negative values on failure. */ int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp) { int rc = lsm_cred_alloc(new, gfp); if (rc) return rc; rc = call_int_hook(cred_prepare, 0, new, old, gfp); if (unlikely(rc)) security_cred_free(new); return rc; } /** * security_transfer_creds() - Transfer creds * @new: target credentials * @old: original credentials * * Transfer data from original creds to new creds. */ void security_transfer_creds(struct cred *new, const struct cred *old) { call_void_hook(cred_transfer, new, old); } /** * security_cred_getsecid() - Get the secid from a set of credentials * @c: credentials * @secid: secid value * * Retrieve the security identifier of the cred structure @c. In case of * failure, @secid will be set to zero. */ void security_cred_getsecid(const struct cred *c, u32 *secid) { *secid = 0; call_void_hook(cred_getsecid, c, secid); } EXPORT_SYMBOL(security_cred_getsecid); /** * security_kernel_act_as() - Set the kernel credentials to act as secid * @new: credentials * @secid: secid * * Set the credentials for a kernel service to act as (subjective context). * The current task must be the one that nominated @secid. * * Return: Returns 0 if successful. */ int security_kernel_act_as(struct cred *new, u32 secid) { return call_int_hook(kernel_act_as, 0, new, secid); } /** * security_kernel_create_files_as() - Set file creation context using an inode * @new: target credentials * @inode: reference inode * * Set the file creation context in a set of credentials to be the same as the * objective context of the specified inode. The current task must be the one * that nominated @inode. * * Return: Returns 0 if successful. */ int security_kernel_create_files_as(struct cred *new, struct inode *inode) { return call_int_hook(kernel_create_files_as, 0, new, inode); } /** * security_kernel_module_request() - Check is loading a module is allowed * @kmod_name: module name * * Ability to trigger the kernel to automatically upcall to userspace for * userspace to load a kernel module with the given name. * * Return: Returns 0 if successful. */ int security_kernel_module_request(char *kmod_name) { int ret; ret = call_int_hook(kernel_module_request, 0, kmod_name); if (ret) return ret; return integrity_kernel_module_request(kmod_name); } /** * security_kernel_read_file() - Read a file specified by userspace * @file: file * @id: file identifier * @contents: trust if security_kernel_post_read_file() will be called * * Read a file specified by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_read_file(struct file *file, enum kernel_read_file_id id, bool contents) { int ret; ret = call_int_hook(kernel_read_file, 0, file, id, contents); if (ret) return ret; return ima_read_file(file, id, contents); } EXPORT_SYMBOL_GPL(security_kernel_read_file); /** * security_kernel_post_read_file() - Read a file specified by userspace * @file: file * @buf: file contents * @size: size of file contents * @id: file identifier * * Read a file specified by userspace. This must be paired with a prior call * to security_kernel_read_file() call that indicated this hook would also be * called, see security_kernel_read_file() for more information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_read_file(struct file *file, char *buf, loff_t size, enum kernel_read_file_id id) { int ret; ret = call_int_hook(kernel_post_read_file, 0, file, buf, size, id); if (ret) return ret; return ima_post_read_file(file, buf, size, id); } EXPORT_SYMBOL_GPL(security_kernel_post_read_file); /** * security_kernel_load_data() - Load data provided by userspace * @id: data identifier * @contents: true if security_kernel_post_load_data() will be called * * Load data provided by userspace. * * Return: Returns 0 if permission is granted. */ int security_kernel_load_data(enum kernel_load_data_id id, bool contents) { int ret; ret = call_int_hook(kernel_load_data, 0, id, contents); if (ret) return ret; return ima_load_data(id, contents); } EXPORT_SYMBOL_GPL(security_kernel_load_data); /** * security_kernel_post_load_data() - Load userspace data from a non-file source * @buf: data * @size: size of data * @id: data identifier * @description: text description of data, specific to the id value * * Load data provided by a non-file source (usually userspace buffer). This * must be paired with a prior security_kernel_load_data() call that indicated * this hook would also be called, see security_kernel_load_data() for more * information. * * Return: Returns 0 if permission is granted. */ int security_kernel_post_load_data(char *buf, loff_t size, enum kernel_load_data_id id, char *description) { int ret; ret = call_int_hook(kernel_post_load_data, 0, buf, size, id, description); if (ret) return ret; return ima_post_load_data(buf, size, id, description); } EXPORT_SYMBOL_GPL(security_kernel_post_load_data); /** * security_task_fix_setuid() - Update LSM with new user id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag values * * Update the module's state after setting one or more of the user identity * attributes of the current process. The @flags parameter indicates which of * the set*uid system calls invoked this hook. If @new is the set of * credentials that will be installed. Modifications should be made to this * rather than to @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setuid, 0, new, old, flags); } /** * security_task_fix_setgid() - Update LSM with new group id attributes * @new: updated credentials * @old: credentials being replaced * @flags: LSM_SETID_* flag value * * Update the module's state after setting one or more of the group identity * attributes of the current process. The @flags parameter indicates which of * the set*gid system calls invoked this hook. @new is the set of credentials * that will be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgid(struct cred *new, const struct cred *old, int flags) { return call_int_hook(task_fix_setgid, 0, new, old, flags); } /** * security_task_fix_setgroups() - Update LSM with new supplementary groups * @new: updated credentials * @old: credentials being replaced * * Update the module's state after setting the supplementary group identity * attributes of the current process. @new is the set of credentials that will * be installed. Modifications should be made to this rather than to * @current->cred. * * Return: Returns 0 on success. */ int security_task_fix_setgroups(struct cred *new, const struct cred *old) { return call_int_hook(task_fix_setgroups, 0, new, old); } /** * security_task_setpgid() - Check if setting the pgid is allowed * @p: task being modified * @pgid: new pgid * * Check permission before setting the process group identifier of the process * @p to @pgid. * * Return: Returns 0 if permission is granted. */ int security_task_setpgid(struct task_struct *p, pid_t pgid) { return call_int_hook(task_setpgid, 0, p, pgid); } /** * security_task_getpgid() - Check if getting the pgid is allowed * @p: task * * Check permission before getting the process group identifier of the process * @p. * * Return: Returns 0 if permission is granted. */ int security_task_getpgid(struct task_struct *p) { return call_int_hook(task_getpgid, 0, p); } /** * security_task_getsid() - Check if getting the session id is allowed * @p: task * * Check permission before getting the session identifier of the process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getsid(struct task_struct *p) { return call_int_hook(task_getsid, 0, p); } /** * security_current_getsecid_subj() - Get the current task's subjective secid * @secid: secid value * * Retrieve the subjective security identifier of the current task and return * it in @secid. In case of failure, @secid will be set to zero. */ void security_current_getsecid_subj(u32 *secid) { *secid = 0; call_void_hook(current_getsecid_subj, secid); } EXPORT_SYMBOL(security_current_getsecid_subj); /** * security_task_getsecid_obj() - Get a task's objective secid * @p: target task * @secid: secid value * * Retrieve the objective security identifier of the task_struct in @p and * return it in @secid. In case of failure, @secid will be set to zero. */ void security_task_getsecid_obj(struct task_struct *p, u32 *secid) { *secid = 0; call_void_hook(task_getsecid_obj, p, secid); } EXPORT_SYMBOL(security_task_getsecid_obj); /** * security_task_setnice() - Check if setting a task's nice value is allowed * @p: target task * @nice: nice value * * Check permission before setting the nice value of @p to @nice. * * Return: Returns 0 if permission is granted. */ int security_task_setnice(struct task_struct *p, int nice) { return call_int_hook(task_setnice, 0, p, nice); } /** * security_task_setioprio() - Check if setting a task's ioprio is allowed * @p: target task * @ioprio: ioprio value * * Check permission before setting the ioprio value of @p to @ioprio. * * Return: Returns 0 if permission is granted. */ int security_task_setioprio(struct task_struct *p, int ioprio) { return call_int_hook(task_setioprio, 0, p, ioprio); } /** * security_task_getioprio() - Check if getting a task's ioprio is allowed * @p: task * * Check permission before getting the ioprio value of @p. * * Return: Returns 0 if permission is granted. */ int security_task_getioprio(struct task_struct *p) { return call_int_hook(task_getioprio, 0, p); } /** * security_task_prlimit() - Check if get/setting resources limits is allowed * @cred: current task credentials * @tcred: target task credentials * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both * * Check permission before getting and/or setting the resource limits of * another task. * * Return: Returns 0 if permission is granted. */ int security_task_prlimit(const struct cred *cred, const struct cred *tcred, unsigned int flags) { return call_int_hook(task_prlimit, 0, cred, tcred, flags); } /** * security_task_setrlimit() - Check if setting a new rlimit value is allowed * @p: target task's group leader * @resource: resource whose limit is being set * @new_rlim: new resource limit * * Check permission before setting the resource limits of process @p for * @resource to @new_rlim. The old resource limit values can be examined by * dereferencing (p->signal->rlim + resource). * * Return: Returns 0 if permission is granted. */ int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) { return call_int_hook(task_setrlimit, 0, p, resource, new_rlim); } /** * security_task_setscheduler() - Check if setting sched policy/param is allowed * @p: target task * * Check permission before setting scheduling policy and/or parameters of * process @p. * * Return: Returns 0 if permission is granted. */ int security_task_setscheduler(struct task_struct *p) { return call_int_hook(task_setscheduler, 0, p); } /** * security_task_getscheduler() - Check if getting scheduling info is allowed * @p: target task * * Check permission before obtaining scheduling information for process @p. * * Return: Returns 0 if permission is granted. */ int security_task_getscheduler(struct task_struct *p) { return call_int_hook(task_getscheduler, 0, p); } /** * security_task_movememory() - Check if moving memory is allowed * @p: task * * Check permission before moving memory owned by process @p. * * Return: Returns 0 if permission is granted. */ int security_task_movememory(struct task_struct *p) { return call_int_hook(task_movememory, 0, p); } /** * security_task_kill() - Check if sending a signal is allowed * @p: target process * @info: signal information * @sig: signal value * @cred: credentials of the signal sender, NULL if @current * * Check permission before sending signal @sig to @p. @info can be NULL, the * constant 1, or a pointer to a kernel_siginfo structure. If @info is 1 or * SI_FROMKERNEL(info) is true, then the signal should be viewed as coming from * the kernel and should typically be permitted. SIGIO signals are handled * separately by the send_sigiotask hook in file_security_ops. * * Return: Returns 0 if permission is granted. */ int security_task_kill(struct task_struct *p, struct kernel_siginfo *info, int sig, const struct cred *cred) { return call_int_hook(task_kill, 0, p, info, sig, cred); } /** * security_task_prctl() - Check if a prctl op is allowed * @option: operation * @arg2: argument * @arg3: argument * @arg4: argument * @arg5: argument * * Check permission before performing a process control operation on the * current process. * * Return: Return -ENOSYS if no-one wanted to handle this op, any other value * to cause prctl() to return immediately with that value. */ int security_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { int thisrc; int rc = LSM_RET_DEFAULT(task_prctl); struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.task_prctl, list) { thisrc = hp->hook.task_prctl(option, arg2, arg3, arg4, arg5); if (thisrc != LSM_RET_DEFAULT(task_prctl)) { rc = thisrc; if (thisrc != 0) break; } } return rc; } /** * security_task_to_inode() - Set the security attributes of a task's inode * @p: task * @inode: inode * * Set the security attributes for an inode based on an associated task's * security attributes, e.g. for /proc/pid inodes. */ void security_task_to_inode(struct task_struct *p, struct inode *inode) { call_void_hook(task_to_inode, p, inode); } /** * security_create_user_ns() - Check if creating a new userns is allowed * @cred: prepared creds * * Check permission prior to creating a new user namespace. * * Return: Returns 0 if successful, otherwise < 0 error code. */ int security_create_user_ns(const struct cred *cred) { return call_int_hook(userns_create, 0, cred); } /** * security_ipc_permission() - Check if sysv ipc access is allowed * @ipcp: ipc permission structure * @flag: requested permissions * * Check permissions for access to IPC. * * Return: Returns 0 if permission is granted. */ int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag) { return call_int_hook(ipc_permission, 0, ipcp, flag); } /** * security_ipc_getsecid() - Get the sysv ipc object's secid * @ipcp: ipc permission structure * @secid: secid pointer * * Get the secid associated with the ipc object. In case of failure, @secid * will be set to zero. */ void security_ipc_getsecid(struct kern_ipc_perm *ipcp, u32 *secid) { *secid = 0; call_void_hook(ipc_getsecid, ipcp, secid); } /** * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob * @msg: message structure * * Allocate and attach a security structure to the msg->security field. The * security field is initialized to NULL when the structure is first created. * * Return: Return 0 if operation was successful and permission is granted. */ int security_msg_msg_alloc(struct msg_msg *msg) { int rc = lsm_msg_msg_alloc(msg); if (unlikely(rc)) return rc; rc = call_int_hook(msg_msg_alloc_security, 0, msg); if (unlikely(rc)) security_msg_msg_free(msg); return rc; } /** * security_msg_msg_free() - Free a sysv ipc message LSM blob * @msg: message structure * * Deallocate the security structure for this message. */ void security_msg_msg_free(struct msg_msg *msg) { call_void_hook(msg_msg_free_security, msg); kfree(msg->security); msg->security = NULL; } /** * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Allocate and attach a security structure to @msg. The security field is * initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_msg_queue_alloc(struct kern_ipc_perm *msq) { int rc = lsm_ipc_alloc(msq); if (unlikely(rc)) return rc; rc = call_int_hook(msg_queue_alloc_security, 0, msq); if (unlikely(rc)) security_msg_queue_free(msq); return rc; } /** * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob * @msq: sysv ipc permission structure * * Deallocate security field @perm->security for the message queue. */ void security_msg_queue_free(struct kern_ipc_perm *msq) { call_void_hook(msg_queue_free_security, msq); kfree(msq->security); msq->security = NULL; } /** * security_msg_queue_associate() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @msqflg: operation flags * * Check permission when a message queue is requested through the msgget system * call. This hook is only called when returning the message queue identifier * for an existing message queue, not when a new message queue is created. * * Return: Return 0 if permission is granted. */ int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg) { return call_int_hook(msg_queue_associate, 0, msq, msqflg); } /** * security_msg_queue_msgctl() - Check if a msg queue operation is allowed * @msq: sysv ipc permission structure * @cmd: operation * * Check permission when a message control operation specified by @cmd is to be * performed on the message queue with permissions. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd) { return call_int_hook(msg_queue_msgctl, 0, msq, cmd); } /** * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed * @msq: sysv ipc permission structure * @msg: message * @msqflg: operation flags * * Check permission before a message, @msg, is enqueued on the message queue * with permissions specified in @msq. * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg) { return call_int_hook(msg_queue_msgsnd, 0, msq, msg, msqflg); } /** * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed * @msq: sysv ipc permission structure * @msg: message * @target: target task * @type: type of message requested * @mode: operation flags * * Check permission before a message, @msg, is removed from the message queue. * The @target task structure contains a pointer to the process that will be * receiving the message (not equal to the current process when inline receives * are being performed). * * Return: Returns 0 if permission is granted. */ int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { return call_int_hook(msg_queue_msgrcv, 0, msq, msg, target, type, mode); } /** * security_shm_alloc() - Allocate a sysv shm LSM blob * @shp: sysv ipc permission structure * * Allocate and attach a security structure to the @shp security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_shm_alloc(struct kern_ipc_perm *shp) { int rc = lsm_ipc_alloc(shp); if (unlikely(rc)) return rc; rc = call_int_hook(shm_alloc_security, 0, shp); if (unlikely(rc)) security_shm_free(shp); return rc; } /** * security_shm_free() - Free a sysv shm LSM blob * @shp: sysv ipc permission structure * * Deallocate the security structure @perm->security for the memory segment. */ void security_shm_free(struct kern_ipc_perm *shp) { call_void_hook(shm_free_security, shp); kfree(shp->security); shp->security = NULL; } /** * security_shm_associate() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @shmflg: operation flags * * Check permission when a shared memory region is requested through the shmget * system call. This hook is only called when returning the shared memory * region identifier for an existing region, not when a new shared memory * region is created. * * Return: Returns 0 if permission is granted. */ int security_shm_associate(struct kern_ipc_perm *shp, int shmflg) { return call_int_hook(shm_associate, 0, shp, shmflg); } /** * security_shm_shmctl() - Check if a sysv shm operation is allowed * @shp: sysv ipc permission structure * @cmd: operation * * Check permission when a shared memory control operation specified by @cmd is * to be performed on the shared memory region with permissions in @shp. * * Return: Return 0 if permission is granted. */ int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd) { return call_int_hook(shm_shmctl, 0, shp, cmd); } /** * security_shm_shmat() - Check if a sysv shm attach operation is allowed * @shp: sysv ipc permission structure * @shmaddr: address of memory region to attach * @shmflg: operation flags * * Check permissions prior to allowing the shmat system call to attach the * shared memory segment with permissions @shp to the data segment of the * calling process. The attaching address is specified by @shmaddr. * * Return: Returns 0 if permission is granted. */ int security_shm_shmat(struct kern_ipc_perm *shp, char __user *shmaddr, int shmflg) { return call_int_hook(shm_shmat, 0, shp, shmaddr, shmflg); } /** * security_sem_alloc() - Allocate a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Allocate and attach a security structure to the @sma security field. The * security field is initialized to NULL when the structure is first created. * * Return: Returns 0 if operation was successful and permission is granted. */ int security_sem_alloc(struct kern_ipc_perm *sma) { int rc = lsm_ipc_alloc(sma); if (unlikely(rc)) return rc; rc = call_int_hook(sem_alloc_security, 0, sma); if (unlikely(rc)) security_sem_free(sma); return rc; } /** * security_sem_free() - Free a sysv semaphore LSM blob * @sma: sysv ipc permission structure * * Deallocate security structure @sma->security for the semaphore. */ void security_sem_free(struct kern_ipc_perm *sma) { call_void_hook(sem_free_security, sma); kfree(sma->security); sma->security = NULL; } /** * security_sem_associate() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @semflg: operation flags * * Check permission when a semaphore is requested through the semget system * call. This hook is only called when returning the semaphore identifier for * an existing semaphore, not when a new one must be created. * * Return: Returns 0 if permission is granted. */ int security_sem_associate(struct kern_ipc_perm *sma, int semflg) { return call_int_hook(sem_associate, 0, sma, semflg); } /** * security_sem_semctl() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @cmd: operation * * Check permission when a semaphore operation specified by @cmd is to be * performed on the semaphore. * * Return: Returns 0 if permission is granted. */ int security_sem_semctl(struct kern_ipc_perm *sma, int cmd) { return call_int_hook(sem_semctl, 0, sma, cmd); } /** * security_sem_semop() - Check if a sysv semaphore operation is allowed * @sma: sysv ipc permission structure * @sops: operations to perform * @nsops: number of operations * @alter: flag indicating changes will be made * * Check permissions before performing operations on members of the semaphore * set. If the @alter flag is nonzero, the semaphore set may be modified. * * Return: Returns 0 if permission is granted. */ int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops, unsigned nsops, int alter) { return call_int_hook(sem_semop, 0, sma, sops, nsops, alter); } /** * security_d_instantiate() - Populate an inode's LSM state based on a dentry * @dentry: dentry * @inode: inode * * Fill in @inode security information for a @dentry if allowed. */ void security_d_instantiate(struct dentry *dentry, struct inode *inode) { if (unlikely(inode && IS_PRIVATE(inode))) return; call_void_hook(d_instantiate, dentry, inode); } EXPORT_SYMBOL(security_d_instantiate); /** * security_getprocattr() - Read an attribute for a task * @p: the task * @lsm: LSM name * @name: attribute name * @value: attribute value * * Read attribute @name for task @p and store it into @value if allowed. * * Return: Returns the length of @value on success, a negative value otherwise. */ int security_getprocattr(struct task_struct *p, const char *lsm, const char *name, char **value) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.getprocattr, list) { if (lsm != NULL && strcmp(lsm, hp->lsm)) continue; return hp->hook.getprocattr(p, name, value); } return LSM_RET_DEFAULT(getprocattr); } /** * security_setprocattr() - Set an attribute for a task * @lsm: LSM name * @name: attribute name * @value: attribute value * @size: attribute value size * * Write (set) the current task's attribute @name to @value, size @size if * allowed. * * Return: Returns bytes written on success, a negative value otherwise. */ int security_setprocattr(const char *lsm, const char *name, void *value, size_t size) { struct security_hook_list *hp; hlist_for_each_entry(hp, &security_hook_heads.setprocattr, list) { if (lsm != NULL && strcmp(lsm, hp->lsm)) continue; return hp->hook.setprocattr(name, value, size); } return LSM_RET_DEFAULT(setprocattr); } /** * security_netlink_send() - Save info and check if netlink sending is allowed * @sk: sending socket * @skb: netlink message * * Save security information for a netlink message so that permission checking * can be performed when the message is processed. The security information * can be saved using the eff_cap field of the netlink_skb_parms structure. * Also may be used to provide fine grained control over message transmission. * * Return: Returns 0 if the information was successfully saved and message is * allowed to be transmitted. */ int security_netlink_send(struct sock *sk, struct sk_buff *skb) { return call_int_hook(netlink_send, 0, sk, skb); } /** * security_ismaclabel() - Check is the named attribute is a MAC label * @name: full extended attribute name * * Check if the extended attribute specified by @name represents a MAC label. * * Return: Returns 1 if name is a MAC attribute otherwise returns 0. */ int security_ismaclabel(const char *name) { return call_int_hook(ismaclabel, 0, name); } EXPORT_SYMBOL(security_ismaclabel); /** * security_secid_to_secctx() - Convert a secid to a secctx * @secid: secid * @secdata: secctx * @seclen: secctx length * * Convert secid to security context. If @secdata is NULL the length of the * result will be returned in @seclen, but no @secdata will be returned. This * does mean that the length could change between calls to check the length and * the next call which actually allocates and returns the @secdata. * * Return: Return 0 on success, error on failure. */ int security_secid_to_secctx(u32 secid, char **secdata, u32 *seclen) { struct security_hook_list *hp; int rc; /* * Currently, only one LSM can implement secid_to_secctx (i.e this * LSM hook is not "stackable"). */ hlist_for_each_entry(hp, &security_hook_heads.secid_to_secctx, list) { rc = hp->hook.secid_to_secctx(secid, secdata, seclen); if (rc != LSM_RET_DEFAULT(secid_to_secctx)) return rc; } return LSM_RET_DEFAULT(secid_to_secctx); } EXPORT_SYMBOL(security_secid_to_secctx); /** * security_secctx_to_secid() - Convert a secctx to a secid * @secdata: secctx * @seclen: length of secctx * @secid: secid * * Convert security context to secid. * * Return: Returns 0 on success, error on failure. */ int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) { *secid = 0; return call_int_hook(secctx_to_secid, 0, secdata, seclen, secid); } EXPORT_SYMBOL(security_secctx_to_secid); /** * security_release_secctx() - Free a secctx buffer * @secdata: secctx * @seclen: length of secctx * * Release the security context. */ void security_release_secctx(char *secdata, u32 seclen) { call_void_hook(release_secctx, secdata, seclen); } EXPORT_SYMBOL(security_release_secctx); /** * security_inode_invalidate_secctx() - Invalidate an inode's security label * @inode: inode * * Notify the security module that it must revalidate the security context of * an inode. */ void security_inode_invalidate_secctx(struct inode *inode) { call_void_hook(inode_invalidate_secctx, inode); } EXPORT_SYMBOL(security_inode_invalidate_secctx); /** * security_inode_notifysecctx() - Nofify the LSM of an inode's security label * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * Notify the security module of what the security context of an inode should * be. Initializes the incore security context managed by the security module * for this inode. Example usage: NFS client invokes this hook to initialize * the security context in its incore inode to the value provided by the server * for the file when the server returned the file's attributes to the client. * Must be called with inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen) { return call_int_hook(inode_notifysecctx, 0, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_notifysecctx); /** * security_inode_setsecctx() - Change the security label of an inode * @dentry: inode * @ctx: secctx * @ctxlen: length of secctx * * Change the security context of an inode. Updates the incore security * context managed by the security module and invokes the fs code as needed * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the * context. Example usage: NFS server invokes this hook to change the security * context in its incore inode and on the backing filesystem to a value * provided by the client on a SETATTR operation. Must be called with * inode->i_mutex locked. * * Return: Returns 0 on success, error on failure. */ int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen) { return call_int_hook(inode_setsecctx, 0, dentry, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_setsecctx); /** * security_inode_getsecctx() - Get the security label of an inode * @inode: inode * @ctx: secctx * @ctxlen: length of secctx * * On success, returns 0 and fills out @ctx and @ctxlen with the security * context for the given @inode. * * Return: Returns 0 on success, error on failure. */ int security_inode_getsecctx(struct inode *inode, void **ctx, u32 *ctxlen) { return call_int_hook(inode_getsecctx, -EOPNOTSUPP, inode, ctx, ctxlen); } EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_WATCH_QUEUE /** * security_post_notification() - Check if a watch notification can be posted * @w_cred: credentials of the task that set the watch * @cred: credentials of the task which triggered the watch * @n: the notification * * Check to see if a watch notification can be posted to a particular queue. * * Return: Returns 0 if permission is granted. */ int security_post_notification(const struct cred *w_cred, const struct cred *cred, struct watch_notification *n) { return call_int_hook(post_notification, 0, w_cred, cred, n); } #endif /* CONFIG_WATCH_QUEUE */ #ifdef CONFIG_KEY_NOTIFICATIONS /** * security_watch_key() - Check if a task is allowed to watch for key events * @key: the key to watch * * Check to see if a process is allowed to watch for event notifications from * a key or keyring. * * Return: Returns 0 if permission is granted. */ int security_watch_key(struct key *key) { return call_int_hook(watch_key, 0, key); } #endif /* CONFIG_KEY_NOTIFICATIONS */ #ifdef CONFIG_SECURITY_NETWORK /** * security_unix_stream_connect() - Check if a AF_UNIX stream is allowed * @sock: originating sock * @other: peer sock * @newsk: new sock * * Check permissions before establishing a Unix domain stream connection * between @sock and @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return call_int_hook(unix_stream_connect, 0, sock, other, newsk); } EXPORT_SYMBOL(security_unix_stream_connect); /** * security_unix_may_send() - Check if AF_UNIX socket can send datagrams * @sock: originating sock * @other: peer sock * * Check permissions before connecting or sending datagrams from @sock to * @other. * * The @unix_stream_connect and @unix_may_send hooks were necessary because * Linux provides an alternative to the conventional file name space for Unix * domain sockets. Whereas binding and connecting to sockets in the file name * space is mediated by the typical file permissions (and caught by the mknod * and permission hooks in inode_security_ops), binding and connecting to * sockets in the abstract name space is completely unmediated. Sufficient * control of Unix domain sockets in the abstract name space isn't possible * using only the socket layer hooks, since we need to know the actual target * socket, which is not looked up until we are inside the af_unix code. * * Return: Returns 0 if permission is granted. */ int security_unix_may_send(struct socket *sock, struct socket *other) { return call_int_hook(unix_may_send, 0, sock, other); } EXPORT_SYMBOL(security_unix_may_send); /** * security_socket_create() - Check if creating a new socket is allowed * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * Check permissions prior to creating a new socket. * * Return: Returns 0 if permission is granted. */ int security_socket_create(int family, int type, int protocol, int kern) { return call_int_hook(socket_create, 0, family, type, protocol, kern); } /** * security_socket_post_create() - Initialize a newly created socket * @sock: socket * @family: protocol family * @type: communications type * @protocol: requested protocol * @kern: set to 1 if a kernel socket is requested * * This hook allows a module to update or allocate a per-socket security * structure. Note that the security field was not added directly to the socket * structure, but rather, the socket security information is stored in the * associated inode. Typically, the inode alloc_security hook will allocate * and attach security information to SOCK_INODE(sock)->i_security. This hook * may be used to update the SOCK_INODE(sock)->i_security field with additional * information that wasn't available when the inode was allocated. * * Return: Returns 0 if permission is granted. */ int security_socket_post_create(struct socket *sock, int family, int type, int protocol, int kern) { return call_int_hook(socket_post_create, 0, sock, family, type, protocol, kern); } /** * security_socket_socketpair() - Check if creating a socketpair is allowed * @socka: first socket * @sockb: second socket * * Check permissions before creating a fresh pair of sockets. * * Return: Returns 0 if permission is granted and the connection was * established. */ int security_socket_socketpair(struct socket *socka, struct socket *sockb) { return call_int_hook(socket_socketpair, 0, socka, sockb); } EXPORT_SYMBOL(security_socket_socketpair); /** * security_socket_bind() - Check if a socket bind operation is allowed * @sock: socket * @address: requested bind address * @addrlen: length of address * * Check permission before socket protocol layer bind operation is performed * and the socket @sock is bound to the address specified in the @address * parameter. * * Return: Returns 0 if permission is granted. */ int security_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_bind, 0, sock, address, addrlen); } /** * security_socket_connect() - Check if a socket connect operation is allowed * @sock: socket * @address: address of remote connection point * @addrlen: length of address * * Check permission before socket protocol layer connect operation attempts to * connect socket @sock to a remote address, @address. * * Return: Returns 0 if permission is granted. */ int security_socket_connect(struct socket *sock, struct sockaddr *address, int addrlen) { return call_int_hook(socket_connect, 0, sock, address, addrlen); } /** * security_socket_listen() - Check if a socket is allowed to listen * @sock: socket * @backlog: connection queue size * * Check permission before socket protocol layer listen operation. * * Return: Returns 0 if permission is granted. */ int security_socket_listen(struct socket *sock, int backlog) { return call_int_hook(socket_listen, 0, sock, backlog); } /** * security_socket_accept() - Check if a socket is allowed to accept connections * @sock: listening socket * @newsock: newly creation connection socket * * Check permission before accepting a new connection. Note that the new * socket, @newsock, has been created and some information copied to it, but * the accept operation has not actually been performed. * * Return: Returns 0 if permission is granted. */ int security_socket_accept(struct socket *sock, struct socket *newsock) { return call_int_hook(socket_accept, 0, sock, newsock); } /** * security_socket_sendmsg() - Check is sending a message is allowed * @sock: sending socket * @msg: message to send * @size: size of message * * Check permission before transmitting a message to another socket. * * Return: Returns 0 if permission is granted. */ int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size) { return call_int_hook(socket_sendmsg, 0, sock, msg, size); } /** * security_socket_recvmsg() - Check if receiving a message is allowed * @sock: receiving socket * @msg: message to receive * @size: size of message * @flags: operational flags * * Check permission before receiving a message from a socket. * * Return: Returns 0 if permission is granted. */ int security_socket_recvmsg(struct socket *sock, struct msghdr *msg, int size, int flags) { return call_int_hook(socket_recvmsg, 0, sock, msg, size, flags); } /** * security_socket_getsockname() - Check if reading the socket addr is allowed * @sock: socket * * Check permission before reading the local address (name) of the socket * object. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockname(struct socket *sock) { return call_int_hook(socket_getsockname, 0, sock); } /** * security_socket_getpeername() - Check if reading the peer's addr is allowed * @sock: socket * * Check permission before the remote address (name) of a socket object. * * Return: Returns 0 if permission is granted. */ int security_socket_getpeername(struct socket *sock) { return call_int_hook(socket_getpeername, 0, sock); } /** * security_socket_getsockopt() - Check if reading a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before retrieving the options associated with socket * @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_getsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_getsockopt, 0, sock, level, optname); } /** * security_socket_setsockopt() - Check if setting a socket option is allowed * @sock: socket * @level: option's protocol level * @optname: option name * * Check permissions before setting the options associated with socket @sock. * * Return: Returns 0 if permission is granted. */ int security_socket_setsockopt(struct socket *sock, int level, int optname) { return call_int_hook(socket_setsockopt, 0, sock, level, optname); } /** * security_socket_shutdown() - Checks if shutting down the socket is allowed * @sock: socket * @how: flag indicating how sends and receives are handled * * Checks permission before all or part of a connection on the socket @sock is * shut down. * * Return: Returns 0 if permission is granted. */ int security_socket_shutdown(struct socket *sock, int how) { return call_int_hook(socket_shutdown, 0, sock, how); } /** * security_sock_rcv_skb() - Check if an incoming network packet is allowed * @sk: destination sock * @skb: incoming packet * * Check permissions on incoming network packets. This hook is distinct from * Netfilter's IP input hooks since it is the first time that the incoming * sk_buff @skb has been associated with a particular socket, @sk. Must not * sleep inside this hook because some callers hold spinlocks. * * Return: Returns 0 if permission is granted. */ int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) { return call_int_hook(socket_sock_rcv_skb, 0, sk, skb); } EXPORT_SYMBOL(security_sock_rcv_skb); /** * security_socket_getpeersec_stream() - Get the remote peer label * @sock: socket * @optval: destination buffer * @optlen: size of peer label copied into the buffer * @len: maximum size of the destination buffer * * This hook allows the security module to provide peer socket security state * for unix or connected tcp sockets to userspace via getsockopt SO_GETPEERSEC. * For tcp sockets this can be meaningful if the socket is associated with an * ipsec SA. * * Return: Returns 0 if all is well, otherwise, typical getsockopt return * values. */ int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval, sockptr_t optlen, unsigned int len) { return call_int_hook(socket_getpeersec_stream, -ENOPROTOOPT, sock, optval, optlen, len); } /** * security_socket_getpeersec_dgram() - Get the remote peer label * @sock: socket * @skb: datagram packet * @secid: remote peer label secid * * This hook allows the security module to provide peer socket security state * for udp sockets on a per-packet basis to userspace via getsockopt * SO_GETPEERSEC. The application must first have indicated the IP_PASSSEC * option via getsockopt. It can then retrieve the security state returned by * this hook for a packet via the SCM_SECURITY ancillary message type. * * Return: Returns 0 on success, error on failure. */ int security_socket_getpeersec_dgram(struct socket *sock, struct sk_buff *skb, u32 *secid) { return call_int_hook(socket_getpeersec_dgram, -ENOPROTOOPT, sock, skb, secid); } EXPORT_SYMBOL(security_socket_getpeersec_dgram); /** * security_sk_alloc() - Allocate and initialize a sock's LSM blob * @sk: sock * @family: protocol family * @priority: gfp flags * * Allocate and attach a security structure to the sk->sk_security field, which * is used to copy security attributes between local stream sockets. * * Return: Returns 0 on success, error on failure. */ int security_sk_alloc(struct sock *sk, int family, gfp_t priority) { return call_int_hook(sk_alloc_security, 0, sk, family, priority); } /** * security_sk_free() - Free the sock's LSM blob * @sk: sock * * Deallocate security structure. */ void security_sk_free(struct sock *sk) { call_void_hook(sk_free_security, sk); } /** * security_sk_clone() - Clone a sock's LSM state * @sk: original sock * @newsk: target sock * * Clone/copy security structure. */ void security_sk_clone(const struct sock *sk, struct sock *newsk) { call_void_hook(sk_clone_security, sk, newsk); } EXPORT_SYMBOL(security_sk_clone); /** * security_sk_classify_flow() - Set a flow's secid based on socket * @sk: original socket * @flic: target flow * * Set the target flow's secid to socket's secid. */ void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic) { call_void_hook(sk_getsecid, sk, &flic->flowic_secid); } EXPORT_SYMBOL(security_sk_classify_flow); /** * security_req_classify_flow() - Set a flow's secid based on request_sock * @req: request_sock * @flic: target flow * * Sets @flic's secid to @req's secid. */ void security_req_classify_flow(const struct request_sock *req, struct flowi_common *flic) { call_void_hook(req_classify_flow, req, flic); } EXPORT_SYMBOL(security_req_classify_flow); /** * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket * @sk: sock being grafted * @parent: target parent socket * * Sets @parent's inode secid to @sk's secid and update @sk with any necessary * LSM state from @parent. */ void security_sock_graft(struct sock *sk, struct socket *parent) { call_void_hook(sock_graft, sk, parent); } EXPORT_SYMBOL(security_sock_graft); /** * security_inet_conn_request() - Set request_sock state using incoming connect * @sk: parent listening sock * @skb: incoming connection * @req: new request_sock * * Initialize the @req LSM state based on @sk and the incoming connect in @skb. * * Return: Returns 0 if permission is granted. */ int security_inet_conn_request(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { return call_int_hook(inet_conn_request, 0, sk, skb, req); } EXPORT_SYMBOL(security_inet_conn_request); /** * security_inet_csk_clone() - Set new sock LSM state based on request_sock * @newsk: new sock * @req: connection request_sock * * Set that LSM state of @sock using the LSM state from @req. */ void security_inet_csk_clone(struct sock *newsk, const struct request_sock *req) { call_void_hook(inet_csk_clone, newsk, req); } /** * security_inet_conn_established() - Update sock's LSM state with connection * @sk: sock * @skb: connection packet * * Update @sock's LSM state to represent a new connection from @skb. */ void security_inet_conn_established(struct sock *sk, struct sk_buff *skb) { call_void_hook(inet_conn_established, sk, skb); } EXPORT_SYMBOL(security_inet_conn_established); /** * security_secmark_relabel_packet() - Check if setting a secmark is allowed * @secid: new secmark value * * Check if the process should be allowed to relabel packets to @secid. * * Return: Returns 0 if permission is granted. */ int security_secmark_relabel_packet(u32 secid) { return call_int_hook(secmark_relabel_packet, 0, secid); } EXPORT_SYMBOL(security_secmark_relabel_packet); /** * security_secmark_refcount_inc() - Increment the secmark labeling rule count * * Tells the LSM to increment the number of secmark labeling rules loaded. */ void security_secmark_refcount_inc(void) { call_void_hook(secmark_refcount_inc); } EXPORT_SYMBOL(security_secmark_refcount_inc); /** * security_secmark_refcount_dec() - Decrement the secmark labeling rule count * * Tells the LSM to decrement the number of secmark labeling rules loaded. */ void security_secmark_refcount_dec(void) { call_void_hook(secmark_refcount_dec); } EXPORT_SYMBOL(security_secmark_refcount_dec); /** * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device * @security: pointer to the LSM blob * * This hook allows a module to allocate a security structure for a TUN device, * returning the pointer in @security. * * Return: Returns a zero on success, negative values on failure. */ int security_tun_dev_alloc_security(void **security) { return call_int_hook(tun_dev_alloc_security, 0, security); } EXPORT_SYMBOL(security_tun_dev_alloc_security); /** * security_tun_dev_free_security() - Free a TUN device LSM blob * @security: LSM blob * * This hook allows a module to free the security structure for a TUN device. */ void security_tun_dev_free_security(void *security) { call_void_hook(tun_dev_free_security, security); } EXPORT_SYMBOL(security_tun_dev_free_security); /** * security_tun_dev_create() - Check if creating a TUN device is allowed * * Check permissions prior to creating a new TUN device. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_create(void) { return call_int_hook(tun_dev_create, 0); } EXPORT_SYMBOL(security_tun_dev_create); /** * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed * @security: TUN device LSM blob * * Check permissions prior to attaching to a TUN device queue. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach_queue(void *security) { return call_int_hook(tun_dev_attach_queue, 0, security); } EXPORT_SYMBOL(security_tun_dev_attach_queue); /** * security_tun_dev_attach() - Update TUN device LSM state on attach * @sk: associated sock * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's sock structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_attach(struct sock *sk, void *security) { return call_int_hook(tun_dev_attach, 0, sk, security); } EXPORT_SYMBOL(security_tun_dev_attach); /** * security_tun_dev_open() - Update TUN device LSM state on open * @security: TUN device LSM blob * * This hook can be used by the module to update any security state associated * with the TUN device's security structure. * * Return: Returns 0 if permission is granted. */ int security_tun_dev_open(void *security) { return call_int_hook(tun_dev_open, 0, security); } EXPORT_SYMBOL(security_tun_dev_open); /** * security_sctp_assoc_request() - Update the LSM on a SCTP association req * @asoc: SCTP association * @skb: packet requesting the association * * Passes the @asoc and @chunk->skb of the association INIT packet to the LSM. * * Return: Returns 0 on success, error on failure. */ int security_sctp_assoc_request(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_request, 0, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_request); /** * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option * @sk: socket * @optname: SCTP option to validate * @address: list of IP addresses to validate * @addrlen: length of the address list * * Validiate permissions required for each address associated with sock @sk. * Depending on @optname, the addresses will be treated as either a connect or * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6). * * Return: Returns 0 on success, error on failure. */ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen) { return call_int_hook(sctp_bind_connect, 0, sk, optname, address, addrlen); } EXPORT_SYMBOL(security_sctp_bind_connect); /** * security_sctp_sk_clone() - Clone a SCTP sock's LSM state * @asoc: SCTP association * @sk: original sock * @newsk: target sock * * Called whenever a new socket is created by accept(2) (i.e. a TCP style * socket) or when a socket is 'peeled off' e.g userspace calls * sctp_peeloff(3). */ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk) { call_void_hook(sctp_sk_clone, asoc, sk, newsk); } EXPORT_SYMBOL(security_sctp_sk_clone); /** * security_sctp_assoc_established() - Update LSM state when assoc established * @asoc: SCTP association * @skb: packet establishing the association * * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet to the * security module. * * Return: Returns 0 if permission is granted. */ int security_sctp_assoc_established(struct sctp_association *asoc, struct sk_buff *skb) { return call_int_hook(sctp_assoc_established, 0, asoc, skb); } EXPORT_SYMBOL(security_sctp_assoc_established); /** * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket * @sk: the owning MPTCP socket * @ssk: the new subflow * * Update the labeling for the given MPTCP subflow, to match the one of the * owning MPTCP socket. This hook has to be called after the socket creation and * initialization via the security_socket_create() and * security_socket_post_create() LSM hooks. * * Return: Returns 0 on success or a negative error code on failure. */ int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk) { return call_int_hook(mptcp_add_subflow, 0, sk, ssk); } #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND /** * security_ib_pkey_access() - Check if access to an IB pkey is allowed * @sec: LSM blob * @subnet_prefix: subnet prefix of the port * @pkey: IB pkey * * Check permission to access a pkey when modifying a QP. * * Return: Returns 0 if permission is granted. */ int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey) { return call_int_hook(ib_pkey_access, 0, sec, subnet_prefix, pkey); } EXPORT_SYMBOL(security_ib_pkey_access); /** * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed * @sec: LSM blob * @dev_name: IB device name * @port_num: port number * * Check permissions to send and receive SMPs on a end port. * * Return: Returns 0 if permission is granted. */ int security_ib_endport_manage_subnet(void *sec, const char *dev_name, u8 port_num) { return call_int_hook(ib_endport_manage_subnet, 0, sec, dev_name, port_num); } EXPORT_SYMBOL(security_ib_endport_manage_subnet); /** * security_ib_alloc_security() - Allocate an Infiniband LSM blob * @sec: LSM blob * * Allocate a security structure for Infiniband objects. * * Return: Returns 0 on success, non-zero on failure. */ int security_ib_alloc_security(void **sec) { return call_int_hook(ib_alloc_security, 0, sec); } EXPORT_SYMBOL(security_ib_alloc_security); /** * security_ib_free_security() - Free an Infiniband LSM blob * @sec: LSM blob * * Deallocate an Infiniband security structure. */ void security_ib_free_security(void *sec) { call_void_hook(ib_free_security, sec); } EXPORT_SYMBOL(security_ib_free_security); #endif /* CONFIG_SECURITY_INFINIBAND */ #ifdef CONFIG_SECURITY_NETWORK_XFRM /** * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob * @ctxp: xfrm security context being added to the SPD * @sec_ctx: security label provided by userspace * @gfp: gfp flags * * Allocate a security structure to the xp->security field; the security field * is initialized to NULL when the xfrm_policy is allocated. * * Return: Return 0 if operation was successful. */ int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *sec_ctx, gfp_t gfp) { return call_int_hook(xfrm_policy_alloc_security, 0, ctxp, sec_ctx, gfp); } EXPORT_SYMBOL(security_xfrm_policy_alloc); /** * security_xfrm_policy_clone() - Clone xfrm policy LSM state * @old_ctx: xfrm security context * @new_ctxp: target xfrm security context * * Allocate a security structure in new_ctxp that contains the information from * the old_ctx structure. * * Return: Return 0 if operation was successful. */ int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx, struct xfrm_sec_ctx **new_ctxp) { return call_int_hook(xfrm_policy_clone_security, 0, old_ctx, new_ctxp); } /** * security_xfrm_policy_free() - Free a xfrm security context * @ctx: xfrm security context * * Free LSM resources associated with @ctx. */ void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx) { call_void_hook(xfrm_policy_free_security, ctx); } EXPORT_SYMBOL(security_xfrm_policy_free); /** * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed * @ctx: xfrm security context * * Authorize deletion of a SPD entry. * * Return: Returns 0 if permission is granted. */ int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx) { return call_int_hook(xfrm_policy_delete_security, 0, ctx); } /** * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @sec_ctx: security label provided by userspace * * Allocate a security structure to the @x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to @sec_ctx. * * Return: Return 0 if operation was successful. */ int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return call_int_hook(xfrm_state_alloc, 0, x, sec_ctx); } EXPORT_SYMBOL(security_xfrm_state_alloc); /** * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob * @x: xfrm state being added to the SAD * @polsec: associated policy's security context * @secid: secid from the flow * * Allocate a security structure to the x->security field; the security field * is initialized to NULL when the xfrm_state is allocated. Set the context to * correspond to secid. * * Return: Returns 0 if operation was successful. */ int security_xfrm_state_alloc_acquire(struct xfrm_state *x, struct xfrm_sec_ctx *polsec, u32 secid) { return call_int_hook(xfrm_state_alloc_acquire, 0, x, polsec, secid); } /** * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed * @x: xfrm state * * Authorize deletion of x->security. * * Return: Returns 0 if permission is granted. */ int security_xfrm_state_delete(struct xfrm_state *x) { return call_int_hook(xfrm_state_delete_security, 0, x); } EXPORT_SYMBOL(security_xfrm_state_delete); /** * security_xfrm_state_free() - Free a xfrm state * @x: xfrm state * * Deallocate x->security. */ void security_xfrm_state_free(struct xfrm_state *x) { call_void_hook(xfrm_state_free_security, x); } /** * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed * @ctx: target xfrm security context * @fl_secid: flow secid used to authorize access * * Check permission when a flow selects a xfrm_policy for processing XFRMs on a * packet. The hook is called when selecting either a per-socket policy or a * generic xfrm policy. * * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on * other errors. */ int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid) { return call_int_hook(xfrm_policy_lookup, 0, ctx, fl_secid); } /** * security_xfrm_state_pol_flow_match() - Check for a xfrm match * @x: xfrm state to match * @xp: xfrm policy to check for a match * @flic: flow to check for a match. * * Check @xp and @flic for a match with @x. * * Return: Returns 1 if there is a match. */ int security_xfrm_state_pol_flow_match(struct xfrm_state *x, struct xfrm_policy *xp, const struct flowi_common *flic) { struct security_hook_list *hp; int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match); /* * Since this function is expected to return 0 or 1, the judgment * becomes difficult if multiple LSMs supply this call. Fortunately, * we can use the first LSM's judgment because currently only SELinux * supplies this call. * * For speed optimization, we explicitly break the loop rather than * using the macro */ hlist_for_each_entry(hp, &security_hook_heads.xfrm_state_pol_flow_match, list) { rc = hp->hook.xfrm_state_pol_flow_match(x, xp, flic); break; } return rc; } /** * security_xfrm_decode_session() - Determine the xfrm secid for a packet * @skb: xfrm packet * @secid: secid * * Decode the packet in @skb and return the security label in @secid. * * Return: Return 0 if all xfrms used have the same secid. */ int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid) { return call_int_hook(xfrm_decode_session, 0, skb, secid, 1); } void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic) { int rc = call_int_hook(xfrm_decode_session, 0, skb, &flic->flowic_secid, 0); BUG_ON(rc); } EXPORT_SYMBOL(security_skb_classify_flow); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS /** * security_key_alloc() - Allocate and initialize a kernel key LSM blob * @key: key * @cred: credentials * @flags: allocation flags * * Permit allocation of a key and assign security data. Note that key does not * have a serial number assigned at this point. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_alloc(struct key *key, const struct cred *cred, unsigned long flags) { return call_int_hook(key_alloc, 0, key, cred, flags); } /** * security_key_free() - Free a kernel key LSM blob * @key: key * * Notification of destruction; free security data. */ void security_key_free(struct key *key) { call_void_hook(key_free, key); } /** * security_key_permission() - Check if a kernel key operation is allowed * @key_ref: key reference * @cred: credentials of actor requesting access * @need_perm: requested permissions * * See whether a specific operational right is granted to a process on a key. * * Return: Return 0 if permission is granted, -ve error otherwise. */ int security_key_permission(key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm) { return call_int_hook(key_permission, 0, key_ref, cred, need_perm); } /** * security_key_getsecurity() - Get the key's security label * @key: key * @buffer: security label buffer * * Get a textual representation of the security context attached to a key for * the purposes of honouring KEYCTL_GETSECURITY. This function allocates the * storage for the NUL-terminated string and the caller should free it. * * Return: Returns the length of @buffer (including terminating NUL) or -ve if * an error occurs. May also return 0 (and a NULL buffer pointer) if * there is no security label assigned to the key. */ int security_key_getsecurity(struct key *key, char **buffer) { *buffer = NULL; return call_int_hook(key_getsecurity, 0, key, buffer); } #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT /** * security_audit_rule_init() - Allocate and init an LSM audit rule struct * @field: audit action * @op: rule operator * @rulestr: rule context * @lsmrule: receive buffer for audit rule struct * * Allocate and initialize an LSM audit rule structure. * * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of * an invalid rule. */ int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule) { return call_int_hook(audit_rule_init, 0, field, op, rulestr, lsmrule); } /** * security_audit_rule_known() - Check if an audit rule contains LSM fields * @krule: audit rule * * Specifies whether given @krule contains any fields related to the current * LSM. * * Return: Returns 1 in case of relation found, 0 otherwise. */ int security_audit_rule_known(struct audit_krule *krule) { return call_int_hook(audit_rule_known, 0, krule); } /** * security_audit_rule_free() - Free an LSM audit rule struct * @lsmrule: audit rule struct * * Deallocate the LSM audit rule structure previously allocated by * audit_rule_init(). */ void security_audit_rule_free(void *lsmrule) { call_void_hook(audit_rule_free, lsmrule); } /** * security_audit_rule_match() - Check if a label matches an audit rule * @secid: security label * @field: LSM audit field * @op: matching operator * @lsmrule: audit rule * * Determine if given @secid matches a rule previously approved by * security_audit_rule_known(). * * Return: Returns 1 if secid matches the rule, 0 if it does not, -ERRNO on * failure. */ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule) { return call_int_hook(audit_rule_match, 0, secid, field, op, lsmrule); } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL /** * security_bpf() - Check if the bpf syscall operation is allowed * @cmd: command * @attr: bpf attribute * @size: size * * Do a initial check for all bpf syscalls after the attribute is copied into * the kernel. The actual security module can implement their own rules to * check the specific cmd they need. * * Return: Returns 0 if permission is granted. */ int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) { return call_int_hook(bpf, 0, cmd, attr, size); } /** * security_bpf_map() - Check if access to a bpf map is allowed * @map: bpf map * @fmode: mode * * Do a check when the kernel generates and returns a file descriptor for eBPF * maps. * * Return: Returns 0 if permission is granted. */ int security_bpf_map(struct bpf_map *map, fmode_t fmode) { return call_int_hook(bpf_map, 0, map, fmode); } /** * security_bpf_prog() - Check if access to a bpf program is allowed * @prog: bpf program * * Do a check when the kernel generates and returns a file descriptor for eBPF * programs. * * Return: Returns 0 if permission is granted. */ int security_bpf_prog(struct bpf_prog *prog) { return call_int_hook(bpf_prog, 0, prog); } /** * security_bpf_map_alloc() - Allocate a bpf map LSM blob * @map: bpf map * * Initialize the security field inside bpf map. * * Return: Returns 0 on success, error on failure. */ int security_bpf_map_alloc(struct bpf_map *map) { return call_int_hook(bpf_map_alloc_security, 0, map); } /** * security_bpf_prog_alloc() - Allocate a bpf program LSM blob * @aux: bpf program aux info struct * * Initialize the security field inside bpf program. * * Return: Returns 0 on success, error on failure. */ int security_bpf_prog_alloc(struct bpf_prog_aux *aux) { return call_int_hook(bpf_prog_alloc_security, 0, aux); } /** * security_bpf_map_free() - Free a bpf map's LSM blob * @map: bpf map * * Clean up the security information stored inside bpf map. */ void security_bpf_map_free(struct bpf_map *map) { call_void_hook(bpf_map_free_security, map); } /** * security_bpf_prog_free() - Free a bpf program's LSM blob * @aux: bpf program aux info struct * * Clean up the security information stored inside bpf prog. */ void security_bpf_prog_free(struct bpf_prog_aux *aux) { call_void_hook(bpf_prog_free_security, aux); } #endif /* CONFIG_BPF_SYSCALL */ /** * security_locked_down() - Check if a kernel feature is allowed * @what: requested kernel feature * * Determine whether a kernel feature that potentially enables arbitrary code * execution in kernel space should be permitted. * * Return: Returns 0 if permission is granted. */ int security_locked_down(enum lockdown_reason what) { return call_int_hook(locked_down, 0, what); } EXPORT_SYMBOL(security_locked_down); #ifdef CONFIG_PERF_EVENTS /** * security_perf_event_open() - Check if a perf event open is allowed * @attr: perf event attribute * @type: type of event * * Check whether the @type of perf_event_open syscall is allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_open(struct perf_event_attr *attr, int type) { return call_int_hook(perf_event_open, 0, attr, type); } /** * security_perf_event_alloc() - Allocate a perf event LSM blob * @event: perf event * * Allocate and save perf_event security info. * * Return: Returns 0 on success, error on failure. */ int security_perf_event_alloc(struct perf_event *event) { return call_int_hook(perf_event_alloc, 0, event); } /** * security_perf_event_free() - Free a perf event LSM blob * @event: perf event * * Release (free) perf_event security info. */ void security_perf_event_free(struct perf_event *event) { call_void_hook(perf_event_free, event); } /** * security_perf_event_read() - Check if reading a perf event label is allowed * @event: perf event * * Read perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_read(struct perf_event *event) { return call_int_hook(perf_event_read, 0, event); } /** * security_perf_event_write() - Check if writing a perf event label is allowed * @event: perf event * * Write perf_event security info if allowed. * * Return: Returns 0 if permission is granted. */ int security_perf_event_write(struct perf_event *event) { return call_int_hook(perf_event_write, 0, event); } #endif /* CONFIG_PERF_EVENTS */ #ifdef CONFIG_IO_URING /** * security_uring_override_creds() - Check if overriding creds is allowed * @new: new credentials * * Check if the current task, executing an io_uring operation, is allowed to * override it's credentials with @new. * * Return: Returns 0 if permission is granted. */ int security_uring_override_creds(const struct cred *new) { return call_int_hook(uring_override_creds, 0, new); } /** * security_uring_sqpoll() - Check if IORING_SETUP_SQPOLL is allowed * * Check whether the current task is allowed to spawn a io_uring polling thread * (IORING_SETUP_SQPOLL). * * Return: Returns 0 if permission is granted. */ int security_uring_sqpoll(void) { return call_int_hook(uring_sqpoll, 0); } /** * security_uring_cmd() - Check if a io_uring passthrough command is allowed * @ioucmd: command * * Check whether the file_operations uring_cmd is allowed to run. * * Return: Returns 0 if permission is granted. */ int security_uring_cmd(struct io_uring_cmd *ioucmd) { return call_int_hook(uring_cmd, 0, ioucmd); } #endif /* CONFIG_IO_URING */
1 1463 1464 1463 1464 1464 1464 1 12 1422 1421 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 // SPDX-License-Identifier: GPL-2.0 /* * Out-of-line refcount functions. */ #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/bug.h> #define REFCOUNT_WARN(str) WARN_ONCE(1, "refcount_t: " str ".\n") void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t) { refcount_set(r, REFCOUNT_SATURATED); switch (t) { case REFCOUNT_ADD_NOT_ZERO_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_OVF: REFCOUNT_WARN("saturated; leaking memory"); break; case REFCOUNT_ADD_UAF: REFCOUNT_WARN("addition on 0; use-after-free"); break; case REFCOUNT_SUB_UAF: REFCOUNT_WARN("underflow; use-after-free"); break; case REFCOUNT_DEC_LEAK: REFCOUNT_WARN("decrement hit 0; leaking memory"); break; default: REFCOUNT_WARN("unknown saturation event!?"); } } EXPORT_SYMBOL(refcount_warn_saturate); /** * refcount_dec_if_one - decrement a refcount if it is 1 * @r: the refcount * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * * Like all decrement operations, it provides release memory order and provides * a control dependency. * * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. * * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { int val = 1; return atomic_try_cmpxchg_release(&r->refs, &val, 0); } EXPORT_SYMBOL(refcount_dec_if_one); /** * refcount_dec_not_one - decrement a refcount if it is not 1 * @r: the refcount * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. * * Was often done like: atomic_add_unless(&var, -1, 1) * * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { unsigned int new, val = atomic_read(&r->refs); do { if (unlikely(val == REFCOUNT_SATURATED)) return true; if (val == 1) return false; new = val - 1; if (new > val) { WARN_ONCE(new > val, "refcount_t: underflow; use-after-free.\n"); return true; } } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } EXPORT_SYMBOL(refcount_dec_not_one); /** * refcount_dec_and_mutex_lock - return holding mutex if able to decrement * refcount to 0 * @r: the refcount * @lock: the mutex to be locked * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold mutex if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { if (refcount_dec_not_one(r)) return false; mutex_lock(lock); if (!refcount_dec_and_test(r)) { mutex_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_mutex_lock); /** * refcount_dec_and_lock - return holding spinlock if able to decrement * refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at REFCOUNT_SATURATED. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { if (refcount_dec_not_one(r)) return false; spin_lock(lock); if (!refcount_dec_and_test(r)) { spin_unlock(lock); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock); /** * refcount_dec_and_lock_irqsave - return holding spinlock with disabled * interrupts if able to decrement refcount to 0 * @r: the refcount * @lock: the spinlock to be locked * @flags: saved IRQ-flags if the is acquired * * Same as refcount_dec_and_lock() above except that the spinlock is acquired * with disabled interrupts. * * Return: true and hold spinlock if able to decrement refcount to 0, false * otherwise */ bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, unsigned long *flags) { if (refcount_dec_not_one(r)) return false; spin_lock_irqsave(lock, *flags); if (!refcount_dec_and_test(r)) { spin_unlock_irqrestore(lock, *flags); return false; } return true; } EXPORT_SYMBOL(refcount_dec_and_lock_irqsave);
2134 5028 1746 73 72 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_H #define _LINUX_SCHED_H /* * Define 'struct task_struct' and provide the main scheduler * APIs (schedule(), wakeup variants, etc.) */ #include <uapi/linux/sched.h> #include <asm/current.h> #include <linux/pid.h> #include <linux/sem.h> #include <linux/shm.h> #include <linux/kmsan_types.h> #include <linux/mutex.h> #include <linux/plist.h> #include <linux/hrtimer.h> #include <linux/irqflags.h> #include <linux/seccomp.h> #include <linux/nodemask.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/syscall_user_dispatch.h> #include <linux/mm_types_task.h> #include <linux/task_io_accounting.h> #include <linux/posix-timers.h> #include <linux/rseq.h> #include <linux/seqlock.h> #include <linux/kcsan.h> #include <linux/rv.h> #include <linux/livepatch_sched.h> #include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; struct bio_list; struct blk_plug; struct bpf_local_storage; struct bpf_run_ctx; struct capture_control; struct cfs_rq; struct fs_struct; struct futex_pi_state; struct io_context; struct io_uring_task; struct mempolicy; struct nameidata; struct nsproxy; struct perf_event_context; struct pid_namespace; struct pipe_inode_info; struct rcu_node; struct reclaim_state; struct robust_list_head; struct root_domain; struct rq; struct sched_attr; struct sched_param; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; struct user_event_mm; /* * Task state bitmask. NOTE! These bits are also * encoded in fs/proc/array.c: get_task_state(). * * We have two separate sets of flags: task->__state * is about runnability, while task->exit_state are * about the task exiting. Confusing, but this way * modifying one set can't modify the other one by * mistake. */ /* Used in tsk->__state: */ #define TASK_RUNNING 0x00000000 #define TASK_INTERRUPTIBLE 0x00000001 #define TASK_UNINTERRUPTIBLE 0x00000002 #define __TASK_STOPPED 0x00000004 #define __TASK_TRACED 0x00000008 /* Used in tsk->exit_state: */ #define EXIT_DEAD 0x00000010 #define EXIT_ZOMBIE 0x00000020 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* Used in tsk->__state again: */ #define TASK_PARKED 0x00000040 #define TASK_DEAD 0x00000080 #define TASK_WAKEKILL 0x00000100 #define TASK_WAKING 0x00000200 #define TASK_NOLOAD 0x00000400 #define TASK_NEW 0x00000800 #define TASK_RTLOCK_WAIT 0x00001000 #define TASK_FREEZABLE 0x00002000 #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) #define TASK_FROZEN 0x00008000 #define TASK_STATE_MAX 0x00010000 #define TASK_ANY (TASK_STATE_MAX-1) /* * DO NOT ADD ANY NEW USERS ! */ #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) /* Convenience macros for the sake of set_current_state: */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED __TASK_TRACED #define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) /* Convenience macros for the sake of wake_up(): */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) /* get_task_state(): */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) #define task_is_traced(task) ((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0) #define task_is_stopped(task) ((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0) #define task_is_stopped_or_traced(task) ((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0) /* * Special states are those that do not use the normal wait-loop pattern. See * the comment with set_special_state(). */ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. */ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. */ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; #ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; #endif struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /** * struct util_est - Estimation utilization of FAIR tasks * @enqueued: instantaneous estimated utilization of a task/cpu * @ewma: the Exponential Weighted Moving Average (EWMA) * utilization of a task * * Support data structure to track an Exponential Weighted Moving Average * (EWMA) of a FAIR task's utilization. New samples are added to the moving * average each time a task completes an activation. Sample's weight is chosen * so that the EWMA will be relatively insensitive to transient changes to the * task's workload. * * The enqueued attribute has a slightly different meaning for tasks and cpus: * - task: the task's util_avg at last task dequeue time * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU * Thus, the util_est.enqueued of a task represents the contribution on the * estimated utilization of the CPU where that task is currently enqueued. * * Only for tasks we track a moving average of the past instantaneous * estimated utilization. This allows to absorb sporadic drops in utilization * of an otherwise almost periodic task. * * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est.enqueued at dequeue * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg * for a task) it is safe to use MSB. */ struct util_est { unsigned int enqueued; unsigned int ewma; #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 } __attribute__((__aligned__(sizeof(u64)))); /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. * * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; struct util_est util_est; } ____cacheline_aligned; struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; u64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_deadline; struct list_head group_node; unsigned int on_rq; u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; s64 vlag; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif #ifdef CONFIG_SMP /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; #endif }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). */ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). */ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; #ifdef CONFIG_PREEMPT_RT /* saved state for "spinlock sleepers" */ unsigned int saved_state; #endif /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. */ int recent_used_cpu; int wake_cpu; #endif int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; #ifdef CONFIG_SMP unsigned short migration_disabled; #endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm; struct mm_struct *active_mm; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. */ unsigned sched_remote_wakeup:1; /* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_IOMMU_SVA unsigned pasid_activated:1; #endif #ifdef CONFIG_CPU_SUP_INTEL unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. */ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_group; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized setup_new_exec() * - access it with [gs]et_task_comm() * - lock it with task_lock() */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif #ifdef CONFIG_DEBUG_MUTEXES /* Mutex deadlock detection: */ struct mutex_waiter *blocked_on; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. * faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; #endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ int last_mm_cid; /* Most recent cid in mm */ int migrate_from_cpu; int mm_cid_active; /* Whether cid bitmap is active */ struct callback_head cid_work; #endif struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ struct ftrace_ret_stack *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. */ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG struct mem_cgroup *memcg_in_oom; gfp_t memcg_oom_gfp_mask; int memcg_oom_order; /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif #ifdef CONFIG_GCC_PLUGIN_STACKLEAK unsigned long lowest_stack; unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor. Nowadays fixed in RV_PER_TASK_MONITORS. * If we find justification for more monitors, we can think * about adding more or developing a dynamic method. So far, * none of these are justified. */ union rv_task_monitor rv[RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ }; static inline struct pid *task_pid(struct task_struct *task) { return task->thread_pid; } /* * the helpers to get the task's different pids as they are seen * from various namespaces * * task_xid_nr() : global id, i.e. the id seen from the init namespace; * task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of * current. * task_xid_nr_ns() : id seen from the ns specified; * * see also pid_nr() etc in include/linux/pid.h */ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns); static inline pid_t task_pid_nr(struct task_struct *tsk) { return tsk->pid; } static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns); } static inline pid_t task_pid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL); } static inline pid_t task_tgid_nr(struct task_struct *tsk) { return tsk->tgid; } /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. * * Test if a process is not yet dead (at most zombie state) * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. * * Return: 1 if the process is alive. 0 otherwise. */ static inline int pid_alive(const struct task_struct *p) { return p->thread_pid != NULL; } static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); } static inline pid_t task_pgrp_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL); } static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns); } static inline pid_t task_session_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns); } static inline pid_t task_tgid_vnr(struct task_struct *tsk) { return __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL); } static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) { pid_t pid = 0; rcu_read_lock(); if (pid_alive(tsk)) pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); rcu_read_unlock(); return pid; } static inline pid_t task_ppid_nr(const struct task_struct *tsk) { return task_ppid_nr_ns(tsk, &init_pid_ns); } /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { return task_pgrp_nr_ns(tsk, &init_pid_ns); } #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. */ if (tsk_state & TASK_RTLOCK_WAIT) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != sizeof(state_char) - 1); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } /** * is_global_init - check if a task structure is init. Since init * is free to have sub-threads we need to check tgid. * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. * * Return: 1 if the task structure is init. 0 otherwise. */ static inline int is_global_init(struct task_struct *tsk) { return task_tgid_nr(tsk) == 1; } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF__HOLE__00010000 0x00010000 #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ #define PF__HOLE__20000000 0x20000000 #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { #ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); #else return true; #endif } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); #ifdef CONFIG_SMP /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); #else static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { } static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { if (!cpumask_test_cpu(0, new_mask)) return -EINVAL; return 0; } static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { if (src->user_cpus_ptr) return -EINVAL; return 0; } static inline void release_user_cpus_ptr(struct task_struct *p) { WARN_ON(p->user_cpus_ptr); } static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { return 0; } #endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. */ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { #ifndef CONFIG_ARCH_TASK_STRUCT_ON_STACK struct task_struct task; #endif #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #elif !defined(__HAVE_THREAD_FUNCTIONS) # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); static inline void set_task_comm(struct task_struct *tsk, const char *from) { __set_task_comm(tsk, from, false); } extern char *__get_task_comm(char *to, size_t len, struct task_struct *tsk); #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) != TASK_COMM_LEN); \ __get_task_comm(buf, sizeof(buf), tsk); \ }) #ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. */ preempt_fold_need_resched(); } #else static inline void scheduler_ipi(void) { } #endif extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) void sched_dynamic_klp_enable(void); void sched_dynamic_klp_disable(void); DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { klp_sched_try_switch(); return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { klp_sched_try_switch(); return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). */ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) static inline void cond_resched_rcu(void) { #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) rcu_read_unlock(); cond_resched(); rcu_read_lock(); #endif } #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } /* * Does the preemption model allow non-cooperative preemption? * * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. */ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_rt(); } /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPTION, * but a general need for low latency) */ static inline int spin_needbreak(spinlock_t *lock) { #ifdef CONFIG_PREEMPTION return spin_is_contended(lock); #else return 0; #endif } /* * Check if a rwlock is contended. * Returns non-zero if there is another task waiting on the rwlock. * Returns zero if the lock is not contended or the system / underlying * rwlock implementation does not support contention detection. * Technically does not depend on CONFIG_PREEMPTION, but a general need * for low latency. */ static inline int rwlock_needbreak(rwlock_t *lock) { #ifdef CONFIG_PREEMPTION return rwlock_is_contended(lock); #else return 0; #endif } static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. */ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif #ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #endif /* CONFIG_SMP */ #ifdef CONFIG_RSEQ /* * Map the event mask on the user-space ABI enum rseq_cs_flags * for direct mask checks. */ enum rseq_event_mask_bits { RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, }; enum rseq_event_mask { RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), }; static inline void rseq_set_notify_resume(struct task_struct *t) { if (t->rseq) set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) __rseq_handle_notify_resume(ksig, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { preempt_disable(); __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); preempt_enable(); rseq_handle_notify_resume(ksig, regs); } /* rseq_preempt() requires preemption to be disabled. */ static inline void rseq_preempt(struct task_struct *t) { __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* rseq_migrate() requires preemption to be disabled. */ static inline void rseq_migrate(struct task_struct *t) { __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); rseq_set_notify_resume(t); } /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; } } static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; t->rseq_event_mask = 0; } #else static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } #endif #ifdef CONFIG_DEBUG_RSEQ void rseq_syscall(struct pt_regs *regs); #else static inline void rseq_syscall(struct pt_regs *regs) { } #endif #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #endif
151 3 98 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; timer-related code * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/times.h> #include "br_private.h" #include "br_private_stp.h" /* called under bridge lock */ static int br_is_designated_for_some_port(const struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && !memcmp(&p->designated_bridge, &br->bridge_id, 8)) return 1; } return 0; } static void br_hello_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, hello_timer); br_debug(br, "hello timer expired\n"); spin_lock(&br->lock); if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, round_jiffies(jiffies + br->hello_time)); } spin_unlock(&br->lock); } static void br_message_age_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, message_age_timer); struct net_bridge *br = p->br; const bridge_id *id = &p->designated_bridge; int was_root; if (p->state == BR_STATE_DISABLED) return; br_info(br, "port %u(%s) neighbor %.2x%.2x.%pM lost\n", (unsigned int) p->port_no, p->dev->name, id->prio[0], id->prio[1], &id->addr); /* * According to the spec, the message age timer cannot be * running when we are the root bridge. So.. this was_root * check is redundant. I'm leaving it in for now, though. */ spin_lock(&br->lock); if (p->state == BR_STATE_DISABLED) goto unlock; was_root = br_is_root_bridge(br); br_become_designated_port(p); br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !was_root) br_become_root_bridge(br); unlock: spin_unlock(&br->lock); } static void br_forward_delay_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, forward_delay_timer); struct net_bridge *br = p->br; br_debug(br, "port %u(%s) forward delay timer\n", (unsigned int) p->port_no, p->dev->name); spin_lock(&br->lock); if (p->state == BR_STATE_LISTENING) { br_set_state(p, BR_STATE_LEARNING); mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } else if (p->state == BR_STATE_LEARNING) { br_set_state(p, BR_STATE_FORWARDING); if (br_is_designated_for_some_port(br)) br_topology_change_detection(br); netif_carrier_on(br->dev); } rcu_read_lock(); br_ifinfo_notify(RTM_NEWLINK, NULL, p); rcu_read_unlock(); spin_unlock(&br->lock); } static void br_tcn_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, tcn_timer); br_debug(br, "tcn timer expired\n"); spin_lock(&br->lock); if (!br_is_root_bridge(br) && (br->dev->flags & IFF_UP)) { br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } spin_unlock(&br->lock); } static void br_topology_change_timer_expired(struct timer_list *t) { struct net_bridge *br = from_timer(br, t, topology_change_timer); br_debug(br, "topo change timer expired\n"); spin_lock(&br->lock); br->topology_change_detected = 0; __br_set_topology_change(br, 0); spin_unlock(&br->lock); } static void br_hold_timer_expired(struct timer_list *t) { struct net_bridge_port *p = from_timer(p, t, hold_timer); br_debug(p->br, "port %u(%s) hold timer expired\n", (unsigned int) p->port_no, p->dev->name); spin_lock(&p->br->lock); if (p->config_pending) br_transmit_config(p); spin_unlock(&p->br->lock); } void br_stp_timer_init(struct net_bridge *br) { timer_setup(&br->hello_timer, br_hello_timer_expired, 0); timer_setup(&br->tcn_timer, br_tcn_timer_expired, 0); timer_setup(&br->topology_change_timer, br_topology_change_timer_expired, 0); } void br_stp_port_timer_init(struct net_bridge_port *p) { timer_setup(&p->message_age_timer, br_message_age_timer_expired, 0); timer_setup(&p->forward_delay_timer, br_forward_delay_timer_expired, 0); timer_setup(&p->hold_timer, br_hold_timer_expired, 0); } /* Report ticks left (in USER_HZ) used for API */ unsigned long br_timer_value(const struct timer_list *timer) { return timer_pending(timer) ? jiffies_delta_to_clock_t(timer->expires - jiffies) : 0; }
15 1754 1754 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 // SPDX-License-Identifier: GPL-2.0-only /* * CAIF Interface registration. * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland * * Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont * and Sakari Ailus <sakari.ailus@nokia.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/spinlock.h> #include <net/netns/generic.h> #include <net/net_namespace.h> #include <net/pkt_sched.h> #include <net/caif/caif_device.h> #include <net/caif/caif_layer.h> #include <net/caif/caif_dev.h> #include <net/caif/cfpkt.h> #include <net/caif/cfcnfg.h> #include <net/caif/cfserl.h> MODULE_LICENSE("GPL"); /* Used for local tracking of the CAIF net devices */ struct caif_device_entry { struct cflayer layer; struct list_head list; struct net_device *netdev; int __percpu *pcpu_refcnt; spinlock_t flow_lock; struct sk_buff *xoff_skb; void (*xoff_skb_dtor)(struct sk_buff *skb); bool xoff; }; struct caif_device_entry_list { struct list_head list; /* Protects simulanous deletes in list */ struct mutex lock; }; struct caif_net { struct cfcnfg *cfg; struct caif_device_entry_list caifdevs; }; static unsigned int caif_net_id; static int q_high = 50; /* Percent */ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); return &caifn->caifdevs; } static void caifd_put(struct caif_device_entry *e) { this_cpu_dec(*e->pcpu_refcnt); } static void caifd_hold(struct caif_device_entry *e) { this_cpu_inc(*e->pcpu_refcnt); } static int caifd_refcnt_read(struct caif_device_entry *e) { int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(e->pcpu_refcnt, i); return refcnt; } /* Allocate new CAIF device. */ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) { struct caif_device_entry *caifd; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) return NULL; caifd->pcpu_refcnt = alloc_percpu(int); if (!caifd->pcpu_refcnt) { kfree(caifd); return NULL; } caifd->netdev = dev; dev_hold(dev); return caifd; } static struct caif_device_entry *caif_get(struct net_device *dev) { struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; list_for_each_entry_rcu(caifd, &caifdevs->list, list, lockdep_rtnl_is_held()) { if (caifd->netdev == dev) return caifd; } return NULL; } static void caif_flow_cb(struct sk_buff *skb) { struct caif_device_entry *caifd; void (*dtor)(struct sk_buff *skb) = NULL; bool send_xoff; WARN_ON(skb->dev == NULL); rcu_read_lock(); caifd = caif_get(skb->dev); WARN_ON(caifd == NULL); if (!caifd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) skb = NULL; caifd->xoff_skb = NULL; caifd->xoff_skb_dtor = NULL; spin_unlock_bh(&caifd->flow_lock); if (dtor && skb) dtor(skb); if (send_xoff) caifd->layer.up-> ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND, caifd->layer.id); caifd_put(caifd); } static int transmit(struct cflayer *layer, struct cfpkt *pkt) { int err, high = 0, qlen = 0; struct caif_device_entry *caifd = container_of(layer, struct caif_device_entry, layer); struct sk_buff *skb; struct netdev_queue *txq; rcu_read_lock_bh(); skb = cfpkt_tonative(pkt); skb->dev = caifd->netdev; skb_reset_network_header(skb); skb->protocol = htons(ETH_P_CAIF); /* Check if we need to handle xoff */ if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE)) goto noxoff; if (unlikely(caifd->xoff)) goto noxoff; if (likely(!netif_queue_stopped(caifd->netdev))) { struct Qdisc *sch; /* If we run with a TX queue, check if the queue is too long*/ txq = netdev_get_tx_queue(skb->dev, 0); sch = rcu_dereference_bh(txq->qdisc); if (likely(qdisc_is_empty(sch))) goto noxoff; /* can check for explicit qdisc len value only !NOLOCK, * always set flow off otherwise */ high = (caifd->netdev->tx_queue_len * q_high) / 100; if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high)) goto noxoff; } /* Hold lock while accessing xoff */ spin_lock_bh(&caifd->flow_lock); if (caifd->xoff) { spin_unlock_bh(&caifd->flow_lock); goto noxoff; } /* * Handle flow off, we do this by temporary hi-jacking this * skb's destructor function, and replace it with our own * flow-on callback. The callback will set flow-on and call * the original destructor. */ pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; spin_unlock_bh(&caifd->flow_lock); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); noxoff: rcu_read_unlock_bh(); err = dev_queue_xmit(skb); if (err > 0) err = -EIO; return err; } /* * Stuff received packets into the CAIF stack. * On error, returns non-zero and releases the skb. */ static int receive(struct sk_buff *skb, struct net_device *dev, struct packet_type *pkttype, struct net_device *orig_dev) { struct cfpkt *pkt; struct caif_device_entry *caifd; int err; pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->receive || !netif_oper_up(caifd->netdev)) { rcu_read_unlock(); kfree_skb(skb); return NET_RX_DROP; } /* Hold reference to netdevice while using CAIF stack */ caifd_hold(caifd); rcu_read_unlock(); err = caifd->layer.up->receive(caifd->layer.up, pkt); /* For -EILSEQ the packet is not freed so free it now */ if (err == -EILSEQ) cfpkt_destroy(pkt); /* Release reference to stack upwards */ caifd_put(caifd); if (err != 0) err = NET_RX_DROP; return err; } static struct packet_type caif_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_CAIF), .func = receive, }; static void dev_flowctrl(struct net_device *dev, int on) { struct caif_device_entry *caifd; rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return; } caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, on ? _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND : _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, caifd->layer.id); caifd_put(caifd); } int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct cflayer *link_support, int head_room, struct cflayer **layer, int (**rcv_func)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *)) { struct caif_device_entry *caifd; enum cfcnfg_phy_preference pref; struct cfcnfg *cfg = get_cfcnfg(dev_net(dev)); struct caif_device_entry_list *caifdevs; int res; caifdevs = caif_device_list(dev_net(dev)); caifd = caif_device_alloc(dev); if (!caifd) return -ENOMEM; *layer = &caifd->layer; spin_lock_init(&caifd->flow_lock); switch (caifdev->link_select) { case CAIF_LINK_HIGH_BANDW: pref = CFPHYPREF_HIGH_BW; break; case CAIF_LINK_LOW_LATENCY: pref = CFPHYPREF_LOW_LAT; break; default: pref = CFPHYPREF_HIGH_BW; break; } mutex_lock(&caifdevs->lock); list_add_rcu(&caifd->list, &caifdevs->list); strscpy(caifd->layer.name, dev->name, sizeof(caifd->layer.name)); caifd->layer.transmit = transmit; res = cfcnfg_add_phy_layer(cfg, dev, &caifd->layer, pref, link_support, caifdev->use_fcs, head_room); mutex_unlock(&caifdevs->lock); if (rcv_func) *rcv_func = receive; return res; } EXPORT_SYMBOL(caif_enroll_dev); /* notify Caif of device events */ static int caif_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct caif_device_entry *caifd = NULL; struct caif_dev_common *caifdev; struct cfcnfg *cfg; struct cflayer *layer, *link_support; int head_room = 0; struct caif_device_entry_list *caifdevs; int res; cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) return 0; switch (what) { case NETDEV_REGISTER: if (caifd != NULL) break; caifdev = netdev_priv(dev); link_support = NULL; if (caifdev->use_frag) { head_room = 1; link_support = cfserl_create(dev->ifindex, caifdev->use_stx); if (!link_support) { pr_warn("Out of memory\n"); break; } } res = caif_enroll_dev(dev, caifdev, link_support, head_room, &layer, NULL); if (res) cfserl_release(link_support); caifdev->flowctrl = dev_flowctrl; break; case NETDEV_UP: rcu_read_lock(); caifd = caif_get(dev); if (caifd == NULL) { rcu_read_unlock(); break; } caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); break; case NETDEV_DOWN: rcu_read_lock(); caifd = caif_get(dev); if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) { rcu_read_unlock(); return -EINVAL; } cfcnfg_set_phy_state(cfg, &caifd->layer, false); caifd_hold(caifd); rcu_read_unlock(); caifd->layer.up->ctrlcmd(caifd->layer.up, _CAIF_CTRLCMD_PHYIF_DOWN_IND, caifd->layer.id); spin_lock_bh(&caifd->flow_lock); /* * Replace our xoff-destructor with original destructor. * We trust that skb->destructor *always* is called before * the skb reference is invalid. The hijacked SKB destructor * takes the flow_lock so manipulating the skb->destructor here * should be safe. */ if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; spin_unlock_bh(&caifd->flow_lock); caifd_put(caifd); break; case NETDEV_UNREGISTER: mutex_lock(&caifdevs->lock); caifd = caif_get(dev); if (caifd == NULL) { mutex_unlock(&caifdevs->lock); break; } list_del_rcu(&caifd->list); /* * NETDEV_UNREGISTER is called repeatedly until all reference * counts for the net-device are released. If references to * caifd is taken, simply ignore NETDEV_UNREGISTER and wait for * the next call to NETDEV_UNREGISTER. * * If any packets are in flight down the CAIF Stack, * cfcnfg_del_phy_layer will return nonzero. * If no packets are in flight, the CAIF Stack associated * with the net-device un-registering is freed. */ if (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0) { pr_info("Wait for device inuse\n"); /* Enrole device if CAIF Stack is still in use */ list_add_rcu(&caifd->list, &caifdevs->list); mutex_unlock(&caifdevs->lock); break; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); mutex_unlock(&caifdevs->lock); break; } return 0; } static struct notifier_block caif_device_notifier = { .notifier_call = caif_device_notify, .priority = 0, }; /* Per-namespace Caif devices handling */ static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); caifn->cfg = cfcnfg_create(); if (!caifn->cfg) return -ENOMEM; return 0; } static void caif_exit_net(struct net *net) { struct caif_device_entry *caifd, *tmp; struct caif_device_entry_list *caifdevs = caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); rtnl_lock(); mutex_lock(&caifdevs->lock); list_for_each_entry_safe(caifd, tmp, &caifdevs->list, list) { int i = 0; list_del_rcu(&caifd->list); cfcnfg_set_phy_state(cfg, &caifd->layer, false); while (i < 10 && (caifd_refcnt_read(caifd) != 0 || cfcnfg_del_phy_layer(cfg, &caifd->layer) != 0)) { pr_info("Wait for device inuse\n"); msleep(250); i++; } synchronize_rcu(); dev_put(caifd->netdev); free_percpu(caifd->pcpu_refcnt); kfree(caifd); } cfcnfg_remove(cfg); mutex_unlock(&caifdevs->lock); rtnl_unlock(); } static struct pernet_operations caif_net_ops = { .init = caif_init_net, .exit = caif_exit_net, .id = &caif_net_id, .size = sizeof(struct caif_net), }; /* Initialize Caif devices list */ static int __init caif_device_init(void) { int result; result = register_pernet_subsys(&caif_net_ops); if (result) return result; register_netdevice_notifier(&caif_device_notifier); dev_add_pack(&caif_packet_type); return result; } static void __exit caif_device_exit(void) { unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); unregister_pernet_subsys(&caif_net_ops); } module_init(caif_device_init); module_exit(caif_device_exit);
39 39 39 24 39 39 39 39 39 38 39 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 ARM Ltd. * * Generic implementation of update_vsyscall and update_vsyscall_tz. * * Based on the x86 specific implementation. */ #include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <vdso/helpers.h> #include <vdso/vsyscall.h> #include "timekeeping_internal.h" static inline void update_vdso_data(struct vdso_data *vdata, struct timekeeper *tk) { struct vdso_timestamp *vdso_ts; u64 nsec, sec; vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; vdata[CS_RAW].mask = tk->tkr_raw.mask; vdata[CS_RAW].mult = tk->tkr_raw.mult; vdata[CS_RAW].shift = tk->tkr_raw.shift; /* CLOCK_MONOTONIC */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* Copy MONOTONIC time for BOOTTIME */ sec = vdso_ts->sec; /* Add the boot offset */ sec += tk->monotonic_to_boot.tv_sec; nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { struct vdso_data *vdata = __arch_get_k_vdso_data(); struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata[CS_HRES_COARSE].clock_mode = clock_mode; vdata[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). * Note: No need to have a second copy. */ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_data(vdata, tk); __arch_update_vsyscall(vdata, tk); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); } void update_vsyscall_tz(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_data(vdata); } /** * vdso_update_begin - Start of a VDSO update section * * Allows architecture code to safely update the architecture specific VDSO * data. Disables interrupts, acquires timekeeper lock to serialize against * concurrent updates from timekeeping and invalidates the VDSO data * sequence counter to prevent concurrent readers from accessing * inconsistent data. * * Returns: Saved interrupt flags which need to be handed in to * vdso_update_end(). */ unsigned long vdso_update_begin(void) { struct vdso_data *vdata = __arch_get_k_vdso_data(); unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); vdso_write_begin(vdata); return flags; } /** * vdso_update_end - End of a VDSO update section * @flags: Interrupt flags as returned from vdso_update_begin() * * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data * synchronization if the architecture requires it, drops timekeeper lock * and restores interrupt flags. */ void vdso_update_end(unsigned long flags) { struct vdso_data *vdata = __arch_get_k_vdso_data(); vdso_write_end(vdata); __arch_sync_vdso_data(vdata); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); }
78 74 73 2 77 162 76 77 90 90 89 4 3 2 86 86 51 157 35 78 5 71 2 87 87 86 2 48 37 87 3 3 3 2 2 3 1 1 1 11 2 2 1 1 1 2 1 78 76 77 23 74 45 2 2 2 43 43 4 4 4 8 4 4 4 4 9 4 4 4 4 4 4 1 4 4 4 9 52 3 3 52 587 586 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 // SPDX-License-Identifier: GPL-2.0-or-later /* * Extension Header handling for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Andi Kleen <ak@muc.de> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ /* Changes: * yoshfuji : ensure not to overrun while parsing * tlv options. * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). * YOSHIFUJI Hideaki @USAGI Register inbound extension header * handlers as inet6_protocol{}. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/dst.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/rawv6.h> #include <net/ndisc.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/calipso.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/xfrm.h> #endif #include <linux/seg6.h> #include <net/seg6.h> #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif #include <net/rpl.h> #include <linux/ioam6.h> #include <net/ioam6.h> #include <net/dst_metadata.h> #include <linux/uaccess.h> /********************* Generic functions *********************/ /* An unknown option is detected, decide what to do */ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff, bool disallow_unknowns) { if (disallow_unknowns) { /* If unknown TLVs are disallowed by configuration * then always silently drop packet. Note this also * means no ICMP parameter problem is sent which * could be a good property to mitigate a reflection DOS * attack. */ goto drop; } switch ((skb_network_header(skb)[optoff] & 0xC0) >> 6) { case 0: /* ignore */ return true; case 1: /* drop packet */ break; case 3: /* Send ICMP if not a multicast address and drop packet */ /* Actually, it is redundant check. icmp_send will recheck in any case. */ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) break; fallthrough; case 2: /* send ICMP PARM PROB regardless and drop packet */ icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } drop: kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); return false; } static bool ipv6_hop_ra(struct sk_buff *skb, int optoff); static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff); static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff); static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff); #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff); #endif /* Parse tlv encoded option header (hop-by-hop or destination) */ static bool ip6_parse_tlv(bool hopbyhop, struct sk_buff *skb, int max_count) { int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); bool disallow_unknowns = false; int tlv_count = 0; int padlen = 0; if (unlikely(max_count < 0)) { disallow_unknowns = true; max_count = -max_count; } off += 2; len -= 2; while (len > 0) { int optlen, i; if (nh[off] == IPV6_TLV_PAD1) { padlen++; if (padlen > 7) goto bad; off++; len--; continue; } if (len < 2) goto bad; optlen = nh[off + 1] + 2; if (optlen > len) goto bad; if (nh[off] == IPV6_TLV_PADN) { /* RFC 2460 states that the purpose of PadN is * to align the containing header to multiples * of 8. 7 is therefore the highest valid value. * See also RFC 4942, Section 2.1.9.5. */ padlen += optlen; if (padlen > 7) goto bad; /* RFC 4942 recommends receiving hosts to * actively check PadN payload to contain * only zeroes. */ for (i = 2; i < optlen; i++) { if (nh[off + i] != 0) goto bad; } } else { tlv_count++; if (tlv_count > max_count) goto bad; if (hopbyhop) { switch (nh[off]) { case IPV6_TLV_ROUTERALERT: if (!ipv6_hop_ra(skb, off)) return false; break; case IPV6_TLV_IOAM: if (!ipv6_hop_ioam(skb, off)) return false; break; case IPV6_TLV_JUMBO: if (!ipv6_hop_jumbo(skb, off)) return false; break; case IPV6_TLV_CALIPSO: if (!ipv6_hop_calipso(skb, off)) return false; break; default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } else { switch (nh[off]) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_TLV_HAO: if (!ipv6_dest_hao(skb, off)) return false; break; #endif default: if (!ip6_tlvopt_unknown(skb, off, disallow_unknowns)) return false; break; } } padlen = 0; } off += optlen; len -= optlen; } if (len == 0) return true; bad: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /***************************** Destination options header. *****************************/ #if IS_ENABLED(CONFIG_IPV6_MIP6) static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) { struct ipv6_destopt_hao *hao; struct inet6_skb_parm *opt = IP6CB(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); SKB_DR(reason); int ret; if (opt->dsthao) { net_dbg_ratelimited("hao duplicated\n"); goto discard; } opt->dsthao = opt->dst1; opt->dst1 = 0; hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); if (hao->length != 16) { net_dbg_ratelimited("hao invalid option length = %d\n", hao->length); SKB_DR_SET(reason, IP_INHDR); goto discard; } if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) { net_dbg_ratelimited("hao is not an unicast addr: %pI6\n", &hao->addr); SKB_DR_SET(reason, INVALID_PROTO); goto discard; } ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr, (xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS); if (unlikely(ret < 0)) { SKB_DR_SET(reason, XFRM_POLICY); goto discard; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) goto discard; /* update all variable using below by copied skbuff */ hao = (struct ipv6_destopt_hao *)(skb_network_header(skb) + optoff); ipv6h = ipv6_hdr(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; swap(ipv6h->saddr, hao->addr); if (skb->tstamp == 0) __net_timestamp(skb); return true; discard: kfree_skb_reason(skb, reason); return false; } #endif static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) __u16 dstbuf; #endif struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(skb->dev); int extlen; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(dev_net(dst->dev), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > net->ipv6.sysctl.max_dst_opts_len) goto fail_and_free; opt->lastopt = opt->dst1 = skb_network_header_len(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) dstbuf = opt->dst1; #endif if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) opt->nhoff = dstbuf; #else opt->nhoff = opt->dst1; #endif return 1; } __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); return -1; } static void seg6_update_csum(struct sk_buff *skb) { struct ipv6_sr_hdr *hdr; struct in6_addr *addr; __be32 from, to; /* srh is at transport offset and seg_left is already decremented * but daddr is not yet updated with next segment */ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); addr = hdr->segments + hdr->segments_left; hdr->segments_left++; from = *(__be32 *)hdr; hdr->segments_left--; to = *(__be32 *)hdr; /* update skb csum with diff resulting from seg_left decrement */ update_csum_diff4(skb, from, to); /* compute csum diff between current and next segment and update */ update_csum_diff16(skb, (__be32 *)(&ipv6_hdr(skb)->daddr), (__be32 *)addr); } static int ipv6_srh_rcv(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct ipv6_sr_hdr *hdr; struct inet6_dev *idev; struct in6_addr *addr; int accept_seg6; hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); idev = __in6_dev_get(skb->dev); accept_seg6 = net->ipv6.devconf_all->seg6_enabled; if (accept_seg6 > idev->cnf.seg6_enabled) accept_seg6 = idev->cnf.seg6_enabled; if (!accept_seg6) { kfree_skb(skb); return -1; } #ifdef CONFIG_IPV6_SEG6_HMAC if (!seg6_hmac_validate_skb(skb)) { kfree_skb(skb); return -1; } #endif looped_back: if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; if (hdr->nexthdr == NEXTHDR_IPV4) skb->protocol = htons(ETH_P_IP); __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } if (hdr->segments_left >= (hdr->hdrlen >> 1)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } if (skb_cloned(skb)) { if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb); } hdr->segments_left--; addr = hdr->segments + hdr->segments_left; skb_push(skb, sizeof(struct ipv6hdr)); if (skb->ip_summed == CHECKSUM_COMPLETE) seg6_update_csum(skb); ipv6_hdr(skb)->daddr = *addr; ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } static int ipv6_rpl_srh_rcv(struct sk_buff *skb) { struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr; struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); struct inet6_dev *idev; struct ipv6hdr *oldhdr; unsigned char *buf; int accept_rpl_seg; int i, err; u64 n = 0; u32 r; idev = __in6_dev_get(skb->dev); accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) accept_rpl_seg = idev->cnf.rpl_seg_enabled; if (!accept_rpl_seg) { kfree_skb(skb); return -1; } looped_back: hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb); if (hdr->segments_left == 0) { if (hdr->nexthdr == NEXTHDR_IPV6) { int offset = (hdr->hdrlen + 1) << 3; skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull(skb, offset); skb_postpull_rcsum(skb, skb_transport_header(skb), offset); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->encapsulation = 0; __skb_tunnel_rx(skb, skb->dev, net); netif_rx(skb); return -1; } opt->srcrt = skb_network_header_len(skb); opt->lastopt = opt->srcrt; skb->transport_header += (hdr->hdrlen + 1) << 3; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre); r = do_div(n, (16 - hdr->cmpri)); /* checks if calculation was without remainder and n fits into * unsigned char which is segments_left field. Should not be * higher than that. */ if (r || (n + 1) > 255) { kfree_skb(skb); return -1; } if (hdr->segments_left > n + 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } hdr->segments_left--; i = n - hdr->segments_left; buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC); if (unlikely(!buf)) { kfree_skb(skb); return -1; } ohdr = (struct ipv6_rpl_sr_hdr *)buf; ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n); chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3)); if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) { kfree_skb(skb); kfree(buf); return -1; } err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1); if (err) { icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0); kfree_skb(skb); kfree(buf); return -1; } swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]); ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n); oldhdr = ipv6_hdr(skb); skb_pull(skb, ((hdr->hdrlen + 1) << 3)); skb_postpull_rcsum(skb, oldhdr, sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3)); if (unlikely(!hdr->segments_left)) { if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); kfree(buf); return -1; } oldhdr = ipv6_hdr(skb); } skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_mac_header_rebuild(skb); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr)); memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, ipv6_hdr(skb), sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3)); kfree(buf); ip6_route_input(skb); if (skb_dst(skb)->error) { dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; skb_pull(skb, sizeof(struct ipv6hdr)); goto looped_back; } dst_input(skb); return -1; } /******************************** Routing header. ********************************/ /* called with rcu_read_lock() */ static int ipv6_rthdr_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); struct inet6_skb_parm *opt = IP6CB(skb); struct in6_addr *addr = NULL; int n, i; struct ipv6_rt_hdr *hdr; struct rt0_hdr *rthdr; struct net *net = dev_net(skb->dev); int accept_source_route = net->ipv6.devconf_all->accept_source_route; if (idev && accept_source_route > idev->cnf.accept_source_route) accept_source_route = idev->cnf.accept_source_route; if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } switch (hdr->type) { case IPV6_SRCRT_TYPE_4: /* segment routing */ return ipv6_srh_rcv(skb); case IPV6_SRCRT_TYPE_3: /* rpl segment routing */ return ipv6_rpl_srh_rcv(skb); default: break; } looped_back: if (hdr->segments_left == 0) { switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: /* Silently discard type 2 header unless it was * processed by own */ if (!addr) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } opt->lastopt = opt->srcrt = skb_network_header_len(skb); skb->transport_header += (hdr->hdrlen + 1) << 3; opt->dst0 = opt->dst1; opt->dst1 = 0; opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb); return 1; } switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (accept_source_route < 0) goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } break; #endif default: goto unknown_rh; } /* * This is the routing header forwarding algorithm from * RFC 2460, page 16. */ n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); return -1; } /* We are about to mangle packet header. Be careful! Do not damage packets queued somewhere. */ if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } hdr = (struct ipv6_rt_hdr *)skb_transport_header(skb); } if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; i = n - --hdr->segments_left; rthdr = (struct rt0_hdr *) hdr; addr = rthdr->addr; addr += i - 1; switch (hdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } break; #endif default: break; } if (ipv6_addr_is_multicast(addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } swap(*addr, ipv6_hdr(skb)->daddr); ip6_route_input(skb); if (skb_dst(skb)->error) { skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; } if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); return -1; } ipv6_hdr(skb)->hop_limit--; goto looped_back; } skb_push(skb, skb->data - skb_network_header(skb)); dst_input(skb); return -1; unknown_rh: __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; } static const struct inet6_protocol rthdr_protocol = { .handler = ipv6_rthdr_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol destopt_protocol = { .handler = ipv6_destopt_rcv, .flags = INET6_PROTO_NOPOLICY, }; static const struct inet6_protocol nodata_protocol = { .handler = dst_discard, .flags = INET6_PROTO_NOPOLICY, }; int __init ipv6_exthdrs_init(void) { int ret; ret = inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING); if (ret) goto out; ret = inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS); if (ret) goto out_rthdr; ret = inet6_add_protocol(&nodata_protocol, IPPROTO_NONE); if (ret) goto out_destopt; out: return ret; out_destopt: inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); out_rthdr: inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); goto out; }; void ipv6_exthdrs_exit(void) { inet6_del_protocol(&nodata_protocol, IPPROTO_NONE); inet6_del_protocol(&destopt_protocol, IPPROTO_DSTOPTS); inet6_del_protocol(&rthdr_protocol, IPPROTO_ROUTING); } /********************************** Hop-by-hop options. **********************************/ /* * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input(). */ static inline struct net *ipv6_skb_net(struct sk_buff *skb) { return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev); } /* Router Alert as of RFC 2711 */ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] == 2) { IP6CB(skb)->flags |= IP6SKB_ROUTERALERT; memcpy(&IP6CB(skb)->ra, nh + optoff + 2, sizeof(IP6CB(skb)->ra)); return true; } net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n", nh[optoff + 1]); kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* IOAM */ static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) { struct ioam6_trace_hdr *trace; struct ioam6_namespace *ns; struct ioam6_hdr *hdr; /* Bad alignment (must be 4n-aligned) */ if (optoff & 3) goto drop; /* Ignore if IOAM is not enabled on ingress */ if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) goto ignore; /* Truncated Option header */ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); if (hdr->opt_len < 2) goto drop; switch (hdr->type) { case IOAM6_TYPE_PREALLOC: /* Truncated Pre-allocated Trace header */ if (hdr->opt_len < 2 + sizeof(*trace)) goto drop; /* Malformed Pre-allocated Trace header */ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) goto drop; /* Ignore if the IOAM namespace is unknown */ ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); if (!ns) goto ignore; if (!skb_valid_dst(skb)) ip6_route_input(skb); ioam6_fill_trace_data(skb, ns, trace, true); break; default: break; } ignore: return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); SKB_DR(reason); u32 pkt_len; if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); SKB_DR_SET(reason, IP_INHDR); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff + 2, SKB_DROP_REASON_IP_INHDR); return false; } if (ipv6_hdr(skb)->payload_len) { icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff, SKB_DROP_REASON_IP_INHDR); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { SKB_DR_SET(reason, PKT_TOO_SMALL); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) goto drop; IP6CB(skb)->flags |= IP6SKB_JUMBOGRAM; return true; drop: kfree_skb_reason(skb, reason); return false; } /* CALIPSO RFC 5570 */ static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff) { const unsigned char *nh = skb_network_header(skb); if (nh[optoff + 1] < 8) goto drop; if (nh[optoff + 6] * 4 + 8 > nh[optoff + 1]) goto drop; if (!calipso_validate(skb, nh + optoff)) goto drop; return true; drop: kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); return false; } int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); int extlen; /* * skb_network_header(skb) is equal to skb->data, and * skb_network_header_len(skb) is always equal to * sizeof(struct ipv6hdr) by definition of * hop-by-hop options. */ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr) + 8) || !pskb_may_pull(skb, (sizeof(struct ipv6hdr) + ((skb_transport_header(skb)[1] + 1) << 3)))) { fail_and_free: kfree_skb(skb); return -1; } extlen = (skb_transport_header(skb)[1] + 1) << 3; if (extlen > net->ipv6.sysctl.max_hbh_opts_len) goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); return 1; } return -1; } /* * Creating outbound headers. * * "build" functions work when skb is filled from head to tail (datagram) * "push" functions work when headers are added from tail to head (tcp) * * In both cases we assume, that caller reserved enough room * for headers. */ static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct rt0_hdr *phdr, *ihdr; int hops; ihdr = (struct rt0_hdr *) opt; phdr = skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); hops = ihdr->rt_hdr.hdrlen >> 1; if (hops > 1) memcpy(phdr->addr, ihdr->addr + 1, (hops - 1) * sizeof(struct in6_addr)); phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; phdr->rt_hdr.nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { struct ipv6_sr_hdr *sr_phdr, *sr_ihdr; int plen, hops; sr_ihdr = (struct ipv6_sr_hdr *)opt; plen = (sr_ihdr->hdrlen + 1) << 3; sr_phdr = skb_push(skb, plen); memcpy(sr_phdr, sr_ihdr, sizeof(struct ipv6_sr_hdr)); hops = sr_ihdr->first_segment + 1; memcpy(sr_phdr->segments + 1, sr_ihdr->segments + 1, (hops - 1) * sizeof(struct in6_addr)); sr_phdr->segments[0] = **addr_p; *addr_p = &sr_ihdr->segments[sr_ihdr->segments_left]; if (sr_ihdr->hdrlen > hops * 2) { int tlvs_offset, tlvs_length; tlvs_offset = (1 + hops * 2) << 3; tlvs_length = (sr_ihdr->hdrlen - hops * 2) << 3; memcpy((char *)sr_phdr + tlvs_offset, (char *)sr_ihdr + tlvs_offset, tlvs_length); } #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(sr_phdr)) { struct net *net = NULL; if (skb->dev) net = dev_net(skb->dev); else if (skb->sk) net = sock_net(skb->sk); WARN_ON(!net); if (net) seg6_push_hmac(net, saddr, sr_phdr); } #endif sr_phdr->nexthdr = *proto; *proto = NEXTHDR_ROUTING; } static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, struct ipv6_rt_hdr *opt, struct in6_addr **addr_p, struct in6_addr *saddr) { switch (opt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); break; default: break; } } static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) { struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt)); memcpy(h, opt, ipv6_optlen(opt)); h->nexthdr = *proto; *proto = type; } void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr, struct in6_addr *saddr) { if (opt->srcrt) { ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); /* * IPV6_RTHDRDSTOPTS is ignored * unless IPV6_RTHDR is set (RFC3542). */ if (opt->dst0opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); } if (opt->hopopt) ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) { if (opt->dst1opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); } EXPORT_SYMBOL(ipv6_push_frag_opts); struct ipv6_txoptions * ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) *((char **)&opt2->dst0opt) += dif; if (opt2->dst1opt) *((char **)&opt2->dst1opt) += dif; if (opt2->srcrt) *((char **)&opt2->srcrt) += dif; refcount_set(&opt2->refcnt, 1); } return opt2; } EXPORT_SYMBOL_GPL(ipv6_dup_options); static void ipv6_renew_option(int renewtype, struct ipv6_opt_hdr **dest, struct ipv6_opt_hdr *old, struct ipv6_opt_hdr *new, int newtype, char **p) { struct ipv6_opt_hdr *src; src = (renewtype == newtype ? new : old); if (!src) return; memcpy(*p, src, ipv6_optlen(src)); *dest = (struct ipv6_opt_hdr *)*p; *p += CMSG_ALIGN(ipv6_optlen(*dest)); } /** * ipv6_renew_options - replace a specific ext hdr with a new one. * * @sk: sock from which to allocate memory * @opt: original options * @newtype: option type to replace in @opt * @newopt: new option of type @newtype to replace (user-mem) * * Returns a new set of options which is a copy of @opt with the * option type @newtype replaced with @newopt. * * @opt may be NULL, in which case a new set of options is returned * containing just @newopt. * * @newopt may be NULL, in which case the specified option type is * not copied into the new set of options. * * The new set of options is allocated from the socket option memory * buffer of @sk. */ struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt) { int tot_len = 0; char *p; struct ipv6_txoptions *opt2; if (opt) { if (newtype != IPV6_HOPOPTS && opt->hopopt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->hopopt)); if (newtype != IPV6_RTHDRDSTOPTS && opt->dst0opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst0opt)); if (newtype != IPV6_RTHDR && opt->srcrt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->srcrt)); if (newtype != IPV6_DSTOPTS && opt->dst1opt) tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt)); } if (newopt) tot_len += CMSG_ALIGN(ipv6_optlen(newopt)); if (!tot_len) return NULL; tot_len += sizeof(*opt2); opt2 = sock_kmalloc(sk, tot_len, GFP_ATOMIC); if (!opt2) return ERR_PTR(-ENOBUFS); memset(opt2, 0, tot_len); refcount_set(&opt2->refcnt, 1); opt2->tot_len = tot_len; p = (char *)(opt2 + 1); ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt, (opt ? opt->hopopt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt, (opt ? opt->dst0opt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_RTHDR, (struct ipv6_opt_hdr **)&opt2->srcrt, (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL), newopt, newtype, &p); ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt, (opt ? opt->dst1opt : NULL), newopt, newtype, &p); opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) + (opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) + (opt2->srcrt ? ipv6_optlen(opt2->srcrt) : 0); opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0); return opt2; } struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) { /* * ignore the dest before srcrt unless srcrt is being included. * --yoshfuji */ if (opt->dst0opt && !opt->srcrt) { if (opt_space != opt) { memcpy(opt_space, opt, sizeof(*opt_space)); opt = opt_space; } opt->opt_nflen -= ipv6_optlen(opt->dst0opt); opt->dst0opt = NULL; } return opt; } EXPORT_SYMBOL_GPL(__ipv6_fixup_options); /** * fl6_update_dst - update flowi destination address with info given * by srcrt option, if any. * * @fl6: flowi6 for which daddr is to be updated * @opt: struct ipv6_txoptions in which to look for srcrt opt * @orig: copy of original daddr address if modified * * Returns NULL if no txoptions or no srcrt, otherwise returns orig * and initial value of fl6->daddr set in orig */ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig) { if (!opt || !opt->srcrt) return NULL; *orig = fl6->daddr; switch (opt->srcrt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; break; case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; fl6->daddr = srh->segments[srh->segments_left]; break; } default: return NULL; } return orig; } EXPORT_SYMBOL_GPL(fl6_update_dst);
4701 4575 398 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef MM_SLAB_H #define MM_SLAB_H /* * Internal slab definitions */ void __init kmem_cache_init(void); #ifdef CONFIG_64BIT # ifdef system_has_cmpxchg128 # define system_has_freelist_aba() system_has_cmpxchg128() # define try_cmpxchg_freelist try_cmpxchg128 # endif #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 typedef u128 freelist_full_t; #else /* CONFIG_64BIT */ # ifdef system_has_cmpxchg64 # define system_has_freelist_aba() system_has_cmpxchg64() # define try_cmpxchg_freelist try_cmpxchg64 # endif #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 typedef u64 freelist_full_t; #endif /* CONFIG_64BIT */ #if defined(system_has_freelist_aba) && !defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) #undef system_has_freelist_aba #endif /* * Freelist pointer and counter to cmpxchg together, avoids the typical ABA * problems with cmpxchg of just a pointer. */ typedef union { struct { void *freelist; unsigned long counter; }; freelist_full_t full; } freelist_aba_t; /* Reuses the bits in struct page */ struct slab { unsigned long __page_flags; #if defined(CONFIG_SLAB) struct kmem_cache *slab_cache; union { struct { struct list_head slab_list; void *freelist; /* array of free object indexes */ void *s_mem; /* first object */ }; struct rcu_head rcu_head; }; unsigned int active; #elif defined(CONFIG_SLUB) struct kmem_cache *slab_cache; union { struct { union { struct list_head slab_list; #ifdef CONFIG_SLUB_CPU_PARTIAL struct { struct slab *next; int slabs; /* Nr of slabs left */ }; #endif }; /* Double-word boundary */ union { struct { void *freelist; /* first free object */ union { unsigned long counters; struct { unsigned inuse:16; unsigned objects:15; unsigned frozen:1; }; }; }; #ifdef system_has_freelist_aba freelist_aba_t freelist_counter; #endif }; }; struct rcu_head rcu_head; }; unsigned int __unused; #else #error "Unexpected slab allocator configured" #endif atomic_t __page_refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; #endif }; #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, __page_flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG SLAB_MATCH(memcg_data, memcg_data); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); #if defined(system_has_freelist_aba) && defined(CONFIG_SLUB) static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t))); #endif /** * folio_slab - Converts from folio to slab. * @folio: The folio. * * Currently struct slab is a different representation of a folio where * folio_test_slab() is true. * * Return: The slab which contains this folio. */ #define folio_slab(folio) (_Generic((folio), \ const struct folio *: (const struct slab *)(folio), \ struct folio *: (struct slab *)(folio))) /** * slab_folio - The folio allocated for a slab * @slab: The slab. * * Slabs are allocated as folios that contain the individual objects and are * using some fields in the first struct page of the folio - those fields are * now accessed by struct slab. It is occasionally necessary to convert back to * a folio in order to communicate with the rest of the mm. Please use this * helper function instead of casting yourself, as the implementation may change * in the future. */ #define slab_folio(s) (_Generic((s), \ const struct slab *: (const struct folio *)s, \ struct slab *: (struct folio *)s)) /** * page_slab - Converts from first struct page to slab. * @p: The first (either head of compound or single) page of slab. * * A temporary wrapper to convert struct page to struct slab in situations where * we know the page is the compound head, or single order-0 page. * * Long-term ideally everything would work with struct slab directly or go * through folio to struct slab. * * Return: The slab which contains this page */ #define page_slab(p) (_Generic((p), \ const struct page *: (const struct slab *)(p), \ struct page *: (struct slab *)(p))) /** * slab_page - The first struct page allocated for a slab * @slab: The slab. * * A convenience wrapper for converting slab to the first struct page of the * underlying folio, to communicate with code not yet converted to folio or * struct slab. */ #define slab_page(s) folio_page(slab_folio(s), 0) /* * If network-based swap is enabled, sl*b must keep track of whether pages * were allocated from pfmemalloc reserves. */ static inline bool slab_test_pfmemalloc(const struct slab *slab) { return folio_test_active((struct folio *)slab_folio(slab)); } static inline void slab_set_pfmemalloc(struct slab *slab) { folio_set_active(slab_folio(slab)); } static inline void slab_clear_pfmemalloc(struct slab *slab) { folio_clear_active(slab_folio(slab)); } static inline void __slab_clear_pfmemalloc(struct slab *slab) { __folio_clear_active(slab_folio(slab)); } static inline void *slab_address(const struct slab *slab) { return folio_address(slab_folio(slab)); } static inline int slab_nid(const struct slab *slab) { return folio_nid(slab_folio(slab)); } static inline pg_data_t *slab_pgdat(const struct slab *slab) { return folio_pgdat(slab_folio(slab)); } static inline struct slab *virt_to_slab(const void *addr) { struct folio *folio = virt_to_folio(addr); if (!folio_test_slab(folio)) return NULL; return folio_slab(folio); } static inline int slab_order(const struct slab *slab) { return folio_order((struct folio *)slab_folio(slab)); } static inline size_t slab_size(const struct slab *slab) { return PAGE_SIZE << slab_order(slab); } #ifdef CONFIG_SLAB #include <linux/slab_def.h> #endif #ifdef CONFIG_SLUB #include <linux/slub_def.h> #endif #include <linux/memcontrol.h> #include <linux/fault-inject.h> #include <linux/kasan.h> #include <linux/kmemleak.h> #include <linux/random.h> #include <linux/sched/mm.h> #include <linux/list_lru.h> /* * State of the slab allocator. * * This is used to describe the states of the allocator during bootup. * Allocators use this to gradually bootstrap themselves. Most allocators * have the problem that the structures used for managing slab caches are * allocated from slab caches themselves. */ enum slab_state { DOWN, /* No slab functionality yet */ PARTIAL, /* SLUB: kmem_cache_node available */ PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ UP, /* Slab caches usable but not all extras yet */ FULL /* Everything is working */ }; extern enum slab_state slab_state; /* The slab cache mutex protects the management structures during changes */ extern struct mutex slab_mutex; /* The list of all slab caches on the system */ extern struct list_head slab_caches; /* The slab cache that manages slab cache information */ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { const char *name[NR_KMALLOC_TYPES]; unsigned int size; } kmalloc_info[]; /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller); void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, unsigned long caller); void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller); gfp_t kmalloc_fix_flags(gfp_t flags); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int size, slab_flags_t flags, unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(unsigned size, unsigned align, slab_flags_t flags, const char *name, void (*ctor)(void *)); struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name); static inline bool is_kmalloc_cache(struct kmem_cache *s) { return (s->flags & SLAB_KMALLOC); } /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) #elif defined(CONFIG_SLUB_DEBUG) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ SLAB_ACCOUNT | SLAB_NO_MERGE) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT | \ SLAB_NO_USER_FLAGS | SLAB_KMALLOC | SLAB_NO_MERGE) #else #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif /* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) /* Common flags permitted for kmem_cache_create */ #define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ SLAB_RED_ZONE | \ SLAB_POISON | \ SLAB_STORE_USER | \ SLAB_TRACE | \ SLAB_CONSISTENCY_CHECKS | \ SLAB_MEM_SPREAD | \ SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ SLAB_ACCOUNT | \ SLAB_KMALLOC | \ SLAB_NO_MERGE | \ SLAB_NO_USER_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; struct file; struct slabinfo { unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs; unsigned long num_slabs; unsigned long shared_avail; unsigned int limit; unsigned int batchcount; unsigned int shared; unsigned int objects_per_slab; unsigned int cache_order; }; void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; } #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DECLARE_STATIC_KEY_TRUE(slub_debug_enabled); #else DECLARE_STATIC_KEY_FALSE(slub_debug_enabled); #endif extern void print_tracking(struct kmem_cache *s, void *object); long validate_slab_cache(struct kmem_cache *s); static inline bool __slub_debug_enabled(void) { return static_branch_unlikely(&slub_debug_enabled); } #else static inline void print_tracking(struct kmem_cache *s, void *object) { } static inline bool __slub_debug_enabled(void) { return false; } #endif /* * Returns true if any of the specified slub_debug flags is enabled for the * cache. Use only for flags parsed by setup_slub_debug() as it also enables * the static key. */ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags) { if (IS_ENABLED(CONFIG_SLUB_DEBUG)) VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS)); if (__slub_debug_enabled()) return s->flags & flags; return false; } #ifdef CONFIG_MEMCG_KMEM /* * slab_objcgs - get the object cgroups vector associated with a slab * @slab: a pointer to the slab struct * * Returns a pointer to the object cgroups vector associated with the slab, * or NULL if no such vector has been associated yet. */ static inline struct obj_cgroup **slab_objcgs(struct slab *slab) { unsigned long memcg_data = READ_ONCE(slab->memcg_data); VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), slab_page(slab)); VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab); void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr); static inline void memcg_free_slab_cgroups(struct slab *slab) { kfree(slab_objcgs(slab)); slab->memcg_data = 0; } static inline size_t obj_full_size(struct kmem_cache *s) { /* * For each accounted object there is an extra space which is used * to store obj_cgroup membership. Charge it too. */ return s->size + sizeof(struct obj_cgroup *); } /* * Returns false if the allocation should fail. */ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, struct obj_cgroup **objcgp, size_t objects, gfp_t flags) { struct obj_cgroup *objcg; if (!memcg_kmem_online()) return true; if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) return true; objcg = get_obj_cgroup_from_current(); if (!objcg) return true; if (lru) { int ret; struct mem_cgroup *memcg; memcg = get_mem_cgroup_from_objcg(objcg); ret = memcg_list_lru_alloc(memcg, lru, flags); css_put(&memcg->css); if (ret) goto out; } if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) goto out; *objcgp = objcg; return true; out: obj_cgroup_put(objcg); return false; } static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, gfp_t flags, size_t size, void **p) { struct slab *slab; unsigned long off; size_t i; if (!memcg_kmem_online() || !objcg) return; for (i = 0; i < size; i++) { if (likely(p[i])) { slab = virt_to_slab(p[i]); if (!slab_objcgs(slab) && memcg_alloc_slab_cgroups(slab, s, flags, false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); slab_objcgs(slab)[off] = objcg; mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), obj_full_size(s)); } else { obj_cgroup_uncharge(objcg, obj_full_size(s)); } } obj_cgroup_put(objcg); } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { struct obj_cgroup **objcgs; int i; if (!memcg_kmem_online()) return; objcgs = slab_objcgs(slab); if (!objcgs) return; for (i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; off = obj_to_index(s, slab, p[i]); objcg = objcgs[off]; if (!objcg) continue; objcgs[off] = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), -obj_full_size(s)); obj_cgroup_put(objcg); } } #else /* CONFIG_MEMCG_KMEM */ static inline struct obj_cgroup **slab_objcgs(struct slab *slab) { return NULL; } static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; } static inline int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { return 0; } static inline void memcg_free_slab_cgroups(struct slab *slab) { } static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, struct obj_cgroup **objcgp, size_t objects, gfp_t flags) { return true; } static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, gfp_t flags, size_t size, void **p) { } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { } #endif /* CONFIG_MEMCG_KMEM */ static inline struct kmem_cache *virt_to_cache(const void *obj) { struct slab *slab; slab = virt_to_slab(obj); if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) return NULL; return slab->slab_cache; } static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) memcg_alloc_slab_cgroups(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); } static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { if (memcg_kmem_online()) memcg_free_slab_cgroups(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); } static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { struct kmem_cache *cachep; if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) return s; cachep = virt_to_cache(x); if (WARN(cachep && cachep != s, "%s: Wrong slab cache. %s but object is from %s\n", __func__, s->name, cachep->name)) print_tracking(cachep, x); return cachep; } void free_large_kmalloc(struct folio *folio, void *object); size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) { #ifndef CONFIG_SLUB return s->object_size; #else /* CONFIG_SLUB */ # ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object * and whatever may come after it. */ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) return s->object_size; # endif if (s->flags & SLAB_KASAN) return s->object_size; /* * If we have the need to store the freelist pointer * back there or track user information then we can * only use the space before that information. */ if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation */ return s->size; #endif } static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, struct obj_cgroup **objcgp, size_t size, gfp_t flags) { flags &= gfp_allowed_mask; might_alloc(flags); if (should_failslab(s, flags)) return NULL; if (!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)) return NULL; return s; } static inline void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, gfp_t flags, size_t size, void **p, bool init, unsigned int orig_size) { unsigned int zero_size = s->object_size; bool kasan_init = init; size_t i; flags &= gfp_allowed_mask; /* * For kmalloc object, the allocated memory size(object_size) is likely * larger than the requested size(orig_size). If redzone check is * enabled for the extra space, don't zero it, as it will be redzoned * soon. The redzone operation for this extra space could be seen as a * replacement of current poisoning under certain debug option, and * won't break other sanity checks. */ if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && (s->flags & SLAB_KMALLOC)) zero_size = orig_size; /* * When slub_debug is enabled, avoid memory initialization integrated * into KASAN and instead zero out the memory via the memset below with * the proper size. Otherwise, KASAN might overwrite SLUB redzones and * cause false-positive reports. This does not lead to a performance * penalty on production builds, as slub_debug is not intended to be * enabled there. */ if (__slub_debug_enabled()) kasan_init = false; /* * As memory initialization might be integrated into KASAN, * kasan_slab_alloc and initialization memset must be * kept together to avoid discrepancies in behavior. * * As p[i] might get tagged, memset and kmemleak hook come after KASAN. */ for (i = 0; i < size; i++) { p[i] = kasan_slab_alloc(s, p[i], flags, kasan_init); if (p[i] && init && (!kasan_init || !kasan_has_integrated_init())) memset(p[i], 0, zero_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); kmsan_slab_alloc(s, p[i], flags); } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); } /* * The slab lists for all objects. */ struct kmem_cache_node { #ifdef CONFIG_SLAB raw_spinlock_t list_lock; struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; unsigned long total_slabs; /* length of all slab lists */ unsigned long free_slabs; /* length of free slab list only */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ struct array_cache *shared; /* shared per node */ struct alien_cache **alien; /* on other nodes */ unsigned long next_reap; /* updated without locking */ int free_touched; /* updated without locking */ #endif #ifdef CONFIG_SLUB spinlock_t list_lock; unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG atomic_long_t nr_slabs; atomic_long_t total_objects; struct list_head full; #endif #endif }; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { return s->node[node]; } /* * Iterator over all nodes. The body will be executed for each node that has * a kmem_cache_node structure allocated (which is true for all online nodes) */ #define for_each_kmem_cache_node(__s, __node, __n) \ for (__node = 0; __node < nr_node_ids; __node++) \ if ((__n = get_node(__s, __node))) #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); #else static inline void dump_unreclaimable_slab(void) { } #endif void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); #ifdef CONFIG_SLAB_FREELIST_RANDOM int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp); void cache_random_seq_destroy(struct kmem_cache *cachep); #else static inline int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp) { return 0; } static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) { if (c->ctor) return false; if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) return flags & __GFP_ZERO; return true; } return flags & __GFP_ZERO; } static inline bool slab_want_init_on_free(struct kmem_cache *c) { if (static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free)) return !(c->ctor || (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); return false; } #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) void debugfs_slab_release(struct kmem_cache *); #else static inline void debugfs_slab_release(struct kmem_cache *s) { } #endif #ifdef CONFIG_PRINTK #define KS_ADDRS_COUNT 16 struct kmem_obj_info { void *kp_ptr; struct slab *kp_slab; void *kp_objp; unsigned long kp_data_offset; struct kmem_cache *kp_slab_cache; void *kp_ret; void *kp_stack[KS_ADDRS_COUNT]; void *kp_free_stack[KS_ADDRS_COUNT]; }; void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab); #endif void __check_heap_object(const void *ptr, unsigned long n, const struct slab *slab, bool to_user); #ifdef CONFIG_SLUB_DEBUG void skip_orig_size_check(struct kmem_cache *s, const void *object); #endif #endif /* MM_SLAB_H */
101 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp tsn mapping array. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Jon Grimm <jgrimm@us.ibm.com> * Karl Knutson <karl@athena.chicago.il.us> * Sridhar Samudrala <sri@us.ibm.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/bitmap.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static void sctp_tsnmap_update(struct sctp_tsnmap *map); static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, __u16 len, __u16 *start, __u16 *end); static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size); /* Initialize a block of memory as a tsnmap. */ struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len, __u32 initial_tsn, gfp_t gfp) { if (!map->tsn_map) { map->tsn_map = kzalloc(len>>3, gfp); if (map->tsn_map == NULL) return NULL; map->len = len; } else { bitmap_zero(map->tsn_map, map->len); } /* Keep track of TSNs represented by tsn_map. */ map->base_tsn = initial_tsn; map->cumulative_tsn_ack_point = initial_tsn - 1; map->max_tsn_seen = map->cumulative_tsn_ack_point; map->num_dup_tsns = 0; return map; } void sctp_tsnmap_free(struct sctp_tsnmap *map) { map->len = 0; kfree(map->tsn_map); } /* Test the tracking state of this TSN. * Returns: * 0 if the TSN has not yet been seen * >0 if the TSN has been seen (duplicate) * <0 if the TSN is invalid (too large to track) */ int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn) { u32 gap; /* Check to see if this is an old TSN */ if (TSN_lte(tsn, map->cumulative_tsn_ack_point)) return 1; /* Verify that we can hold this TSN and that it will not * overflow our map */ if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) return -1; /* Calculate the index into the mapping arrays. */ gap = tsn - map->base_tsn; /* Check to see if TSN has already been recorded. */ if (gap < map->len && test_bit(gap, map->tsn_map)) return 1; else return 0; } /* Mark this TSN as seen. */ int sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn, struct sctp_transport *trans) { u16 gap; if (TSN_lt(tsn, map->base_tsn)) return 0; gap = tsn - map->base_tsn; if (gap >= map->len && !sctp_tsnmap_grow(map, gap + 1)) return -ENOMEM; if (!sctp_tsnmap_has_gap(map) && gap == 0) { /* In this case the map has no gaps and the tsn we are * recording is the next expected tsn. We don't touch * the map but simply bump the values. */ map->max_tsn_seen++; map->cumulative_tsn_ack_point++; if (trans) trans->sack_generation = trans->asoc->peer.sack_generation; map->base_tsn++; } else { /* Either we already have a gap, or about to record a gap, so * have work to do. * * Bump the max. */ if (TSN_lt(map->max_tsn_seen, tsn)) map->max_tsn_seen = tsn; /* Mark the TSN as received. */ set_bit(gap, map->tsn_map); /* Go fixup any internal TSN mapping variables including * cumulative_tsn_ack_point. */ sctp_tsnmap_update(map); } return 0; } /* Initialize a Gap Ack Block iterator from memory being provided. */ static void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map, struct sctp_tsnmap_iter *iter) { /* Only start looking one past the Cumulative TSN Ack Point. */ iter->start = map->cumulative_tsn_ack_point + 1; } /* Get the next Gap Ack Blocks. Returns 0 if there was not another block * to get. */ static int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map, struct sctp_tsnmap_iter *iter, __u16 *start, __u16 *end) { int ended = 0; __u16 start_ = 0, end_ = 0, offset; /* If there are no more gap acks possible, get out fast. */ if (TSN_lte(map->max_tsn_seen, iter->start)) return 0; offset = iter->start - map->base_tsn; sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len, &start_, &end_); /* The Gap Ack Block happens to end at the end of the map. */ if (start_ && !end_) end_ = map->len - 1; /* If we found a Gap Ack Block, return the start and end and * bump the iterator forward. */ if (end_) { /* Fix up the start and end based on the * Cumulative TSN Ack which is always 1 behind base. */ *start = start_ + 1; *end = end_ + 1; /* Move the iterator forward. */ iter->start = map->cumulative_tsn_ack_point + *end + 1; ended = 1; } return ended; } /* Mark this and any lower TSN as seen. */ void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn) { u32 gap; if (TSN_lt(tsn, map->base_tsn)) return; if (!TSN_lt(tsn, map->base_tsn + SCTP_TSN_MAP_SIZE)) return; /* Bump the max. */ if (TSN_lt(map->max_tsn_seen, tsn)) map->max_tsn_seen = tsn; gap = tsn - map->base_tsn + 1; map->base_tsn += gap; map->cumulative_tsn_ack_point += gap; if (gap >= map->len) { /* If our gap is larger then the map size, just * zero out the map. */ bitmap_zero(map->tsn_map, map->len); } else { /* If the gap is smaller than the map size, * shift the map by 'gap' bits and update further. */ bitmap_shift_right(map->tsn_map, map->tsn_map, gap, map->len); sctp_tsnmap_update(map); } } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This private helper function updates the tsnmap buffers and * the Cumulative TSN Ack Point. */ static void sctp_tsnmap_update(struct sctp_tsnmap *map) { u16 len; unsigned long zero_bit; len = map->max_tsn_seen - map->cumulative_tsn_ack_point; zero_bit = find_first_zero_bit(map->tsn_map, len); if (!zero_bit) return; /* The first 0-bit is bit 0. nothing to do */ map->base_tsn += zero_bit; map->cumulative_tsn_ack_point += zero_bit; bitmap_shift_right(map->tsn_map, map->tsn_map, zero_bit, map->len); } /* How many data chunks are we missing from our peer? */ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map) { __u32 cum_tsn = map->cumulative_tsn_ack_point; __u32 max_tsn = map->max_tsn_seen; __u32 base_tsn = map->base_tsn; __u16 pending_data; u32 gap; pending_data = max_tsn - cum_tsn; gap = max_tsn - base_tsn; if (gap == 0 || gap >= map->len) goto out; pending_data -= bitmap_weight(map->tsn_map, gap + 1); out: return pending_data; } /* This is a private helper for finding Gap Ack Blocks. It searches a * single array for the start and end of a Gap Ack Block. * * The flags "started" and "ended" tell is if we found the beginning * or (respectively) the end of a Gap Ack Block. */ static void sctp_tsnmap_find_gap_ack(unsigned long *map, __u16 off, __u16 len, __u16 *start, __u16 *end) { int i = off; /* Look through the entire array, but break out * early if we have found the end of the Gap Ack Block. */ /* Also, stop looking past the maximum TSN seen. */ /* Look for the start. */ i = find_next_bit(map, len, off); if (i < len) *start = i; /* Look for the end. */ if (*start) { /* We have found the start, let's find the * end. If we find the end, break out. */ i = find_next_zero_bit(map, len, i); if (i < len) *end = i - 1; } } /* Renege that we have seen a TSN. */ void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn) { u32 gap; if (TSN_lt(tsn, map->base_tsn)) return; /* Assert: TSN is in range. */ if (!TSN_lt(tsn, map->base_tsn + map->len)) return; gap = tsn - map->base_tsn; /* Pretend we never saw the TSN. */ clear_bit(gap, map->tsn_map); } /* How many gap ack blocks do we have recorded? */ __u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map, struct sctp_gap_ack_block *gabs) { struct sctp_tsnmap_iter iter; int ngaps = 0; /* Refresh the gap ack information. */ if (sctp_tsnmap_has_gap(map)) { __u16 start = 0, end = 0; sctp_tsnmap_iter_init(map, &iter); while (sctp_tsnmap_next_gap_ack(map, &iter, &start, &end)) { gabs[ngaps].start = htons(start); gabs[ngaps].end = htons(end); ngaps++; if (ngaps >= SCTP_MAX_GABS) break; } } return ngaps; } static int sctp_tsnmap_grow(struct sctp_tsnmap *map, u16 size) { unsigned long *new; unsigned long inc; u16 len; if (size > SCTP_TSN_MAP_SIZE) return 0; inc = ALIGN((size - map->len), BITS_PER_LONG) + SCTP_TSN_MAP_INCREMENT; len = min_t(u16, map->len + inc, SCTP_TSN_MAP_SIZE); new = kzalloc(len>>3, GFP_ATOMIC); if (!new) return 0; bitmap_copy(new, map->tsn_map, map->max_tsn_seen - map->cumulative_tsn_ack_point); kfree(map->tsn_map); map->tsn_map = new; map->len = len; return 1; }
4537 352 5834 1223 2279 7010 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC_H #define _ASM_X86_ATOMIC_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> #include <asm/rmwcc.h> #include <asm/barrier.h> /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. */ static __always_inline int arch_atomic_read(const atomic_t *v) { /* * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here, * it's non-inlined function that increases binary size and stack usage. */ return __READ_ONCE((v)->counter); } static __always_inline void arch_atomic_set(atomic_t *v, int i) { __WRITE_ONCE(v->counter, i); } static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "addl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "subl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test static __always_inline void arch_atomic_inc(atomic_t *v) { asm volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_dec(atomic_t *v) { asm volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic_add_return arch_atomic_add_return static __always_inline int arch_atomic_sub_return(int i, atomic_t *v) { return arch_atomic_add_return(-i, v); } #define arch_atomic_sub_return arch_atomic_sub_return static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); } #define arch_atomic_fetch_add arch_atomic_fetch_add static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v) { return xadd(&v->counter, -i); } #define arch_atomic_fetch_sub arch_atomic_fetch_sub static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic_cmpxchg arch_atomic_cmpxchg static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { return arch_try_cmpxchg(&v->counter, old, new); } #define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg static __always_inline int arch_atomic_xchg(atomic_t *v, int new) { return arch_xchg(&v->counter, new); } #define arch_atomic_xchg arch_atomic_xchg static __always_inline void arch_atomic_and(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "andl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i)); return val; } #define arch_atomic_fetch_and arch_atomic_fetch_and static __always_inline void arch_atomic_or(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "orl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i)); return val; } #define arch_atomic_fetch_or arch_atomic_fetch_or static __always_inline void arch_atomic_xor(int i, atomic_t *v) { asm volatile(LOCK_PREFIX "xorl %1,%0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i)); return val; } #define arch_atomic_fetch_xor arch_atomic_fetch_xor #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else # include <asm/atomic64_64.h> #endif #endif /* _ASM_X86_ATOMIC_H */
1262 9 57 60 864 58 809 838 56 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 // SPDX-License-Identifier: GPL-2.0 /* * hrtimers - High-resolution kernel timers * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes * * Started by: Thomas Gleixner and Ingo Molnar */ #ifndef _LINUX_HRTIMER_H #define _LINUX_HRTIMER_H #include <linux/hrtimer_defs.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/list.h> #include <linux/percpu.h> #include <linux/seqlock.h> #include <linux/timer.h> #include <linux/timerqueue.h> struct hrtimer_clock_base; struct hrtimer_cpu_base; /* * Mode arguments of xxx_hrtimer functions: * * HRTIMER_MODE_ABS - Time value is absolute * HRTIMER_MODE_REL - Time value is relative to now * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered * when starting the timer) * HRTIMER_MODE_SOFT - Timer callback function will be executed in * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, HRTIMER_MODE_REL = 0x01, HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD, HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; /* * Return values for the callback function */ enum hrtimer_restart { HRTIMER_NORESTART, /* Timer is not restarted */ HRTIMER_RESTART, /* Timer must be restarted */ }; /* * Values to track state of the timer * * Possible states: * * 0x00 inactive * 0x01 enqueued into rbtree * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free * the timer from the callback function. * * Therefore we track the callback state in: * * timer->base->cpu_base->running == timer * * On SMP it is possible to have a "callback function running and enqueued" * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 /** * struct hrtimer - the basic hrtimer structure * @node: timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding * slack to the _softexpires value. For non range timers * identical to _softexpires. * @_softexpires: the absolute earliest expiry time of the hrtimer. * The time which was given as expiry time when the timer * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. * * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; u8 is_soft; u8 is_hard; }; /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure * @task: task to wake up * * task is set to NULL, when the timer expires. */ struct hrtimer_sleeper { struct hrtimer timer; struct task_struct *task; }; #ifdef CONFIG_64BIT # define __hrtimer_clock_base_align ____cacheline_aligned #else # define __hrtimer_clock_base_align #endif /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base * @index: clock type index for per_cpu support when moving a * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; unsigned int index; clockid_t clockid; seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, HRTIMER_BASE_MONOTONIC_SOFT, HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, HRTIMER_MAX_CLOCK_BASES, }; /** * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode * @in_hrtirq: hrtimer_interrupt() is currently executing * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. * @nr_events: Total number of hrtimer interrupt events * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are * expired * @timer_waiters: A hrtimer_cancel() invocation waits for the timer * callback to finish. * @expires_next: absolute time of the next event, is required for remote * hrtimer enqueue; it is the total first expiry time (hard * and soft hrtimer are taken into account) * @next_timer: Pointer to the first expiring timer * @softirq_expires_next: Time to check, if soft queues needs also to be expired * @softirq_next_timer: Pointer to the first expiring softirq based timer * @clock_base: array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). * Do not dereference the pointer because it is not reliable on * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, softirq_activated : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; unsigned short nr_hangs; unsigned int max_hang_time; #endif #ifdef CONFIG_PREEMPT_RT spinlock_t softirq_expiry_lock; atomic_t timer_waiters; #endif ktime_t expires_next; struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; } ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { timer->node.expires = tv64; timer->_softexpires = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { return timer->node.expires; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { return ktime_to_ns(timer->node.expires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { return ktime_sub(timer->node.expires, timer->base->get_time()); } static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? timer->base->cpu_base->hres_active : 0; } #ifdef CONFIG_HIGH_RES_TIMERS struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC #endif static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { ktime_t rem = ktime_sub(timer->node.expires, now); /* * Adjust relative timers for the extra we added in * hrtimer_start_range_ns() to prevent short timeouts. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) rem -= hrtimer_resolution; return rem; } static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { return __hrtimer_expires_remaining_adjusted(timer, timer->base->get_time()); } #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); extern void timerfd_resume(void); #else static inline void timerfd_clock_was_set(void) { } static inline void timerfd_resume(void) { } #endif DECLARE_PER_CPU(struct tick_device, tick_cpu_device); #ifdef CONFIG_PREEMPT_RT void hrtimer_cancel_wait_running(const struct hrtimer *timer); #else static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) { cpu_relax(); } #endif /* Exported timer functions: */ /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else static inline void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode) { hrtimer_init(timer, which_clock, mode); } static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { hrtimer_init_sleeper(sl, clock_id, mode); } static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); static inline void hrtimer_restart(struct hrtimer *timer) { hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read */ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); } extern u64 hrtimer_get_next_event(void); extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** * hrtimer_is_queued - check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise * * The function can be used lockless, but it gives only a current snapshot. */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { /* The READ_ONCE pairs with the update functions of timer->state */ return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); } /* * Helper function to check, whether the timer is running the callback * function */ static inline int hrtimer_callback_running(struct hrtimer *timer) { return timer->base->running == timer; } /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); /** * hrtimer_forward_now - forward the timer expiry so it expires after now * @timer: hrtimer to forward * @interval: the interval to forward * * Forward the timer expiry so it will expire after the current time * of the hrtimer clock base. Returns the number of overruns. * * Can be safely called from the callback function of @timer. If * called from other contexts @timer must neither be enqueued nor * running the callback and the caller needs to take care of * serialization. * * Note: This only updates the timer expiry value and does not requeue * the timer. */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { return hrtimer_forward(timer, timer->base->get_time(), interval); } /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); /* Show pending timers: */ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_dead_cpu(unsigned int cpu); #else #define hrtimers_dead_cpu NULL #endif #endif
20 20 20 20 20 20 20 20 17 17 17 140 140 140 140 140 140 15 15 15 2 1 1 1 1 1 1 1 1 1 3 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 // SPDX-License-Identifier: GPL-2.0-or-later /* * Device handling code * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netpoll.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/list.h> #include <linux/netfilter_bridge.h> #include <linux/uaccess.h> #include "br_private.h" #define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \ NETIF_F_GSO_MASK | NETIF_F_HW_CSUM) const struct nf_br_ops __rcu *nf_br_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_br_ops); /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_bridge_mcast_port *pmctx_null = NULL; struct net_bridge *br = netdev_priv(dev); struct net_bridge_mcast *brmctx = &br->multicast_ctx; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; struct net_bridge_vlan *vlan; const unsigned char *dest; u16 vid = 0; memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); br_tc_skb_miss_set(skb, false); rcu_read_lock(); nf_ops = rcu_dereference(nf_br_ops); if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) { rcu_read_unlock(); return NETDEV_TX_OK; } dev_sw_netstats_tx_add(dev, 1, skb->len); br_switchdev_frame_unmark(skb); BR_INPUT_SKB_CB(skb)->brdev = dev; BR_INPUT_SKB_CB(skb)->frag_max_size = 0; skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state, &vlan)) goto out; if (IS_ENABLED(CONFIG_INET) && (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) || eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { br_do_proxy_suppress_arp(skb, br, vid, NULL); } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6) && br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *msg, _msg; msg = br_is_nd_neigh_msg(skb, &_msg); if (msg) br_do_suppress_nd(skb, br, vid, NULL, msg); } dest = eth_hdr(skb)->h_dest; if (is_broadcast_ether_addr(dest)) { br_flood(br, skb, BR_PKT_BROADCAST, false, true, vid); } else if (is_multicast_ether_addr(dest)) { if (unlikely(netpoll_tx_running(dev))) { br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid); goto out; } if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) { kfree_skb(skb); goto out; } mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) br_multicast_flood(mdst, skb, brmctx, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid); } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { br_forward(dst->dst, skb, false, true); } else { br_flood(br, skb, BR_PKT_UNICAST, false, true, vid); } out: rcu_read_unlock(); return NETDEV_TX_OK; } static struct lock_class_key bridge_netdev_addr_lock_key; static void br_set_lockdep_class(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); } static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = br_fdb_hash_init(br); if (err) { free_percpu(dev->tstats); return err; } err = br_mdb_hash_init(br); if (err) { free_percpu(dev->tstats); br_fdb_hash_fini(br); return err; } err = br_vlan_init(br); if (err) { free_percpu(dev->tstats); br_mdb_hash_fini(br); br_fdb_hash_fini(br); return err; } err = br_multicast_init_stats(br); if (err) { free_percpu(dev->tstats); br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); } br_set_lockdep_class(dev); return err; } static void br_dev_uninit(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); br_multicast_dev_del(br); br_multicast_uninit_stats(br); br_vlan_flush(br); br_mdb_hash_fini(br); br_fdb_hash_fini(br); free_percpu(dev->tstats); } static int br_dev_open(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); netdev_update_features(dev); netif_start_queue(dev); br_stp_enable_bridge(br); br_multicast_open(br); if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) br_multicast_join_snoopers(br); return 0; } static void br_dev_set_multicast_list(struct net_device *dev) { } static void br_dev_change_rx_flags(struct net_device *dev, int change) { if (change & IFF_PROMISC) br_manage_promisc(netdev_priv(dev)); } static int br_dev_stop(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); br_stp_disable_bridge(br); br_multicast_stop(br); if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) br_multicast_leave_snoopers(br); netif_stop_queue(dev); return 0; } static int br_change_mtu(struct net_device *dev, int new_mtu) { struct net_bridge *br = netdev_priv(dev); dev->mtu = new_mtu; /* this flag will be cleared if the MTU was automatically adjusted */ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* remember the MTU in the rtable for PMTU */ dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu); #endif return 0; } /* Allow setting mac address to any valid ethernet address. */ static int br_set_mac_address(struct net_device *dev, void *p) { struct net_bridge *br = netdev_priv(dev); struct sockaddr *addr = p; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; /* dev_set_mac_addr() can be called by a master device on bridge's * NETDEV_UNREGISTER, but since it's being destroyed do nothing */ if (dev->reg_state != NETREG_REGISTERED) return -EBUSY; spin_lock_bh(&br->lock); if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) { /* Mac address will be changed in br_stp_change_bridge_id(). */ br_stp_change_bridge_id(br, addr->sa_data); } spin_unlock_bh(&br->lock); return 0; } static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) { strscpy(info->driver, "bridge", sizeof(info->driver)); strscpy(info->version, BR_VERSION, sizeof(info->version)); strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); strscpy(info->bus_info, "N/A", sizeof(info->bus_info)); } static int br_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; cmd->base.speed = SPEED_UNKNOWN; list_for_each_entry(p, &br->port_list, list) { struct ethtool_link_ksettings ecmd; struct net_device *pdev = p->dev; if (!netif_running(pdev) || !netif_oper_up(pdev)) continue; if (__ethtool_get_link_ksettings(pdev, &ecmd)) continue; if (ecmd.base.speed == (__u32)SPEED_UNKNOWN) continue; if (cmd->base.speed == (__u32)SPEED_UNKNOWN || cmd->base.speed < ecmd.base.speed) cmd->base.speed = ecmd.base.speed; } return 0; } static netdev_features_t br_fix_features(struct net_device *dev, netdev_features_t features) { struct net_bridge *br = netdev_priv(dev); return br_features_recompute(br, features); } #ifdef CONFIG_NET_POLL_CONTROLLER static void br_poll_controller(struct net_device *br_dev) { } static void br_netpoll_cleanup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) br_netpoll_disable(p); } static int __br_netpoll_enable(struct net_bridge_port *p) { struct netpoll *np; int err; np = kzalloc(sizeof(*p->np), GFP_KERNEL); if (!np) return -ENOMEM; err = __netpoll_setup(np, p->dev); if (err) { kfree(np); return err; } p->np = np; return err; } int br_netpoll_enable(struct net_bridge_port *p) { if (!p->br->dev->npinfo) return 0; return __br_netpoll_enable(p); } static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p; int err = 0; list_for_each_entry(p, &br->port_list, list) { if (!p->dev) continue; err = __br_netpoll_enable(p); if (err) goto fail; } out: return err; fail: br_netpoll_cleanup(dev); goto out; } void br_netpoll_disable(struct net_bridge_port *p) { struct netpoll *np = p->np; if (!np) return; p->np = NULL; __netpoll_free(np); } #endif static int br_add_slave(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); return br_add_if(br, slave_dev, extack); } static int br_del_slave(struct net_device *dev, struct net_device *slave_dev) { struct net_bridge *br = netdev_priv(dev); return br_del_if(br, slave_dev); } static int br_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct net_bridge_fdb_entry *f; struct net_bridge_port *dst; struct net_bridge *br; if (netif_is_bridge_port(ctx->dev)) return -1; br = netdev_priv(ctx->dev); br_vlan_fill_forward_path_pvid(br, ctx, path); f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id); if (!f || !f->dst) return -1; dst = READ_ONCE(f->dst); if (!dst) return -1; if (br_vlan_fill_forward_path_mode(br, dst, path)) return -1; path->type = DEV_PATH_BRIDGE; path->dev = dst->br->dev; ctx->dev = dst->dev; switch (path->bridge.vlan_mode) { case DEV_PATH_BR_VLAN_TAG: if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan)) return -ENOSPC; ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id; ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto; ctx->num_vlans++; break; case DEV_PATH_BR_VLAN_UNTAG_HW: case DEV_PATH_BR_VLAN_UNTAG: ctx->num_vlans--; break; case DEV_PATH_BR_VLAN_KEEP: break; } return 0; } static const struct ethtool_ops br_ethtool_ops = { .get_drvinfo = br_getinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = br_get_link_ksettings, }; static const struct net_device_ops br_netdev_ops = { .ndo_open = br_dev_open, .ndo_stop = br_dev_stop, .ndo_init = br_dev_init, .ndo_uninit = br_dev_uninit, .ndo_start_xmit = br_dev_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = br_set_mac_address, .ndo_set_rx_mode = br_dev_set_multicast_list, .ndo_change_rx_flags = br_dev_change_rx_flags, .ndo_change_mtu = br_change_mtu, .ndo_siocdevprivate = br_dev_siocdevprivate, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = br_netpoll_setup, .ndo_netpoll_cleanup = br_netpoll_cleanup, .ndo_poll_controller = br_poll_controller, #endif .ndo_add_slave = br_add_slave, .ndo_del_slave = br_del_slave, .ndo_fix_features = br_fix_features, .ndo_fdb_add = br_fdb_add, .ndo_fdb_del = br_fdb_delete, .ndo_fdb_del_bulk = br_fdb_delete_bulk, .ndo_fdb_dump = br_fdb_dump, .ndo_fdb_get = br_fdb_get, .ndo_mdb_add = br_mdb_add, .ndo_mdb_del = br_mdb_del, .ndo_mdb_dump = br_mdb_dump, .ndo_bridge_getlink = br_getlink, .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, .ndo_features_check = passthru_features_check, .ndo_fill_forward_path = br_fill_forward_path, }; static struct device_type br_type = { .name = "bridge", }; void br_dev_setup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); eth_hw_addr_random(dev); ether_setup(dev); dev->netdev_ops = &br_netdev_ops; dev->needs_free_netdev = true; dev->ethtool_ops = &br_ethtool_ops; SET_NETDEV_DEVTYPE(dev, &br_type); dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->vlan_features = COMMON_FEATURES; br->dev = dev; spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); INIT_HLIST_HEAD(&br->frame_type_list); #if IS_ENABLED(CONFIG_BRIDGE_MRP) INIT_HLIST_HEAD(&br->mrp_list); #endif #if IS_ENABLED(CONFIG_BRIDGE_CFM) INIT_HLIST_HEAD(&br->mep_list); #endif spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; br->bridge_id.prio[1] = 0x00; ether_addr_copy(br->group_addr, eth_stp_addr); br->stp_enabled = BR_NO_STP; br->group_fwd_mask = BR_GROUPFWD_DEFAULT; br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; br->designated_root = br->bridge_id; br->bridge_max_age = br->max_age = 20 * HZ; br->bridge_hello_time = br->hello_time = 2 * HZ; br->bridge_forward_delay = br->forward_delay = 15 * HZ; br->bridge_ageing_time = br->ageing_time = BR_DEFAULT_AGEING_TIME; dev->max_mtu = ETH_MAX_MTU; br_netfilter_rtable_init(br); br_stp_timer_init(br); br_multicast_init(br); INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup); }
1702 1703 1703 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X_TABLES_H #define _X_TABLES_H #include <linux/netdevice.h> #include <linux/static_key.h> #include <linux/netfilter.h> #include <uapi/linux/netfilter/x_tables.h> /* Test a struct->invflags and a boolean for inequality */ #define NF_INVF(ptr, flag, boolean) \ ((boolean) ^ !!((ptr)->invflags & (flag))) /** * struct xt_action_param - parameters for matches/targets * * @match: the match extension * @target: the target extension * @matchinfo: per-match data * @targetinfo: per-target data * @state: pointer to hook state this packet came from * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data * * Fields written to by extensions: * * @hotdrop: drop packet if we had inspection problems */ struct xt_action_param { union { const struct xt_match *match; const struct xt_target *target; }; union { const void *matchinfo, *targinfo; }; const struct nf_hook_state *state; unsigned int thoff; u16 fragoff; bool hotdrop; }; static inline struct net *xt_net(const struct xt_action_param *par) { return par->state->net; } static inline struct net_device *xt_in(const struct xt_action_param *par) { return par->state->in; } static inline const char *xt_inname(const struct xt_action_param *par) { return par->state->in->name; } static inline struct net_device *xt_out(const struct xt_action_param *par) { return par->state->out; } static inline const char *xt_outname(const struct xt_action_param *par) { return par->state->out->name; } static inline unsigned int xt_hooknum(const struct xt_action_param *par) { return par->state->hook; } static inline u_int8_t xt_family(const struct xt_action_param *par) { return par->state->pf; } /** * struct xt_mtchk_param - parameters for match extensions' * checkentry functions * * @net: network namespace through which the check was invoked * @table: table the rule is tried to be inserted into * @entryinfo: the family-specific rule data * (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry) * @match: struct xt_match through which this function was invoked * @matchinfo: per-match data * @hook_mask: via which hooks the new rule is reachable * Other fields as above. */ struct xt_mtchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_match *match; void *matchinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /** * struct xt_mdtor_param - match destructor parameters * Fields as above. */ struct xt_mtdtor_param { struct net *net; const struct xt_match *match; void *matchinfo; u_int8_t family; }; /** * struct xt_tgchk_param - parameters for target extensions' * checkentry functions * * @entryinfo: the family-specific rule data * (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry) * * Other fields see above. */ struct xt_tgchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_target *target; void *targinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /* Target destructor parameters */ struct xt_tgdtor_param { struct net *net; const struct xt_target *target; void *targinfo; u_int8_t family; }; struct xt_match { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Return true or false: return FALSE and set *hotdrop = 1 to force immediate packet drop. */ /* Arguments changed since 2.6.9, as this must now handle non-linear skb, using skb_header_pointer and skb_ip_make_writable. */ bool (*match)(const struct sk_buff *skb, struct xt_action_param *); /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int matchsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Registration hooks for targets. */ struct xt_target { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Returns verdict. Argument order changed since 2.6.9, as this must now handle non-linear skbs, using skb_copy_bits and skb_ip_make_writable. */ unsigned int (*target)(struct sk_buff *skb, const struct xt_action_param *); /* Called when user tries to insert an entry of this type: hook_mask is a bitmask of hooks from which it can be called. */ /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int targetsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Furniture shopping... */ struct xt_table { struct list_head list; /* What hooks you will enter on */ unsigned int valid_hooks; /* Man behind the curtain... */ struct xt_table_info *private; /* hook ops that register the table with the netfilter core */ struct nf_hook_ops *ops; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; u_int8_t af; /* address/protocol family */ int priority; /* hook order */ /* A unique name... */ const char name[XT_TABLE_MAXNAMELEN]; }; #include <linux/netfilter_ipv4.h> /* The table itself */ struct xt_table_info { /* Size per table */ unsigned int size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ unsigned int initial_entries; /* Entry points and underflows */ unsigned int hook_entry[NF_INET_NUMHOOKS]; unsigned int underflow[NF_INET_NUMHOOKS]; /* * Number of user chains. Since tables cannot have loops, at most * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; void ***jumpstack; unsigned char entries[] __aligned(8); }; int xt_register_target(struct xt_target *target); void xt_unregister_target(struct xt_target *target); int xt_register_targets(struct xt_target *target, unsigned int n); void xt_unregister_targets(struct xt_target *target, unsigned int n); int xt_register_match(struct xt_match *target); void xt_unregister_match(struct xt_match *target); int xt_register_matches(struct xt_match *match, unsigned int n); void xt_unregister_matches(struct xt_match *match, unsigned int n); int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); int xt_check_proc_name(const char *name, unsigned int size); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u); int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, int usersize, int size, int aligned_size); void *xt_copy_counters(sockptr_t arg, unsigned int len, struct xt_counters_info *info); struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error); struct xt_match *xt_find_match(u8 af, const char *name, u8 revision); struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err); struct xt_table *xt_find_table(struct net *net, u8 af, const char *name); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, const char *name); void xt_table_unlock(struct xt_table *t); int xt_proto_init(struct net *net, u_int8_t af); void xt_proto_fini(struct net *net, u_int8_t af); struct xt_table_info *xt_alloc_table_info(unsigned int size); void xt_free_table_info(struct xt_table_info *info); /** * xt_recseq - recursive seqcount for netfilter use * * Packet processing changes the seqcount only if no recursion happened * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), * because we use the normal seqcount convention : * Low order bit set to 1 if a writer is active. */ DECLARE_PER_CPU(seqcount_t, xt_recseq); /* xt_tee_enabled - true if x_tables needs to handle reentrancy * * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. */ extern struct static_key xt_tee_enabled; /** * xt_write_recseq_begin - start of a write section * * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) * Returns : * 1 if no recursion on this cpu * 0 if recursion detected */ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). */ addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1; /* * This is kind of a write_seqcount_begin(), but addend is 0 or 1 * We dont check addend value to avoid a test and conditional jump, * since addend is most likely 1 */ __this_cpu_add(xt_recseq.sequence, addend); smp_mb(); return addend; } /** * xt_write_recseq_end - end of a write section * @addend: return value from previous xt_write_recseq_begin() * * End packet processing : all readers can proceed * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) */ static inline void xt_write_recseq_end(unsigned int addend) { /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); } /* * This helper is performance critical and must be inlined */ static inline unsigned long ifname_compare_aligned(const char *_a, const char *_b, const char *_mask) { const unsigned long *a = (const unsigned long *)_a; const unsigned long *b = (const unsigned long *)_b; const unsigned long *mask = (const unsigned long *)_mask; unsigned long ret; ret = (a[0] ^ b[0]) & mask[0]; if (IFNAMSIZ > sizeof(unsigned long)) ret |= (a[1] ^ b[1]) & mask[1]; if (IFNAMSIZ > 2 * sizeof(unsigned long)) ret |= (a[2] ^ b[2]) & mask[2]; if (IFNAMSIZ > 3 * sizeof(unsigned long)) ret |= (a[3] ^ b[3]) & mask[3]; BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); return ret; } struct xt_percpu_counter_alloc_state { unsigned int off; const char __percpu *mem; }; bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, struct xt_counters *counter); void xt_percpu_counter_free(struct xt_counters *cnt); static inline struct xt_counters * xt_get_this_cpu_counter(struct xt_counters *cnt) { if (nr_cpu_ids > 1) return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt); return cnt; } static inline struct xt_counters * xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) { if (nr_cpu_ids > 1) return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu); return cnt; } struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net)); void xt_unregister_template(const struct xt_table *t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_xt_entry_match { union { struct { u_int16_t match_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t match_size; compat_uptr_t match; } kernel; u_int16_t match_size; } u; unsigned char data[]; }; struct compat_xt_entry_target { union { struct { u_int16_t target_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t target_size; compat_uptr_t target; } kernel; u_int16_t target_size; } u; unsigned char data[]; }; /* FIXME: this works only on 32 bit tasks * need to change whole approach in order to calculate align as function of * current task alignment */ struct compat_xt_counters { compat_u64 pcnt, bcnt; /* Packet and byte counters */ }; struct compat_xt_counters_info { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t num_counters; struct compat_xt_counters counters[]; }; struct _compat_xt_align { __u8 u8; __u16 u16; __u32 u32; compat_u64 u64; }; #define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align)) void xt_compat_lock(u_int8_t af); void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, unsigned int *size); int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size); int xt_compat_target_offset(const struct xt_target *target); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, unsigned int *size); int xt_compat_target_to_user(const struct xt_entry_target *t, void __user **dstptr, unsigned int *size); int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ #endif /* _X_TABLES_H */
72 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 // SPDX-License-Identifier: GPL-2.0-or-later /* SHA-512 code by Jean-Luc Cooke <jlcooke@certainkey.com> * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> */ #include <crypto/internal/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/crypto.h> #include <linux/types.h> #include <crypto/sha2.h> #include <crypto/sha512_base.h> #include <linux/percpu.h> #include <asm/byteorder.h> #include <asm/unaligned.h> const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE] = { 0x38, 0xb0, 0x60, 0xa7, 0x51, 0xac, 0x96, 0x38, 0x4c, 0xd9, 0x32, 0x7e, 0xb1, 0xb1, 0xe3, 0x6a, 0x21, 0xfd, 0xb7, 0x11, 0x14, 0xbe, 0x07, 0x43, 0x4c, 0x0c, 0xc7, 0xbf, 0x63, 0xf6, 0xe1, 0xda, 0x27, 0x4e, 0xde, 0xbf, 0xe7, 0x6f, 0x65, 0xfb, 0xd5, 0x1a, 0xd2, 0xf1, 0x48, 0x98, 0xb9, 0x5b }; EXPORT_SYMBOL_GPL(sha384_zero_message_hash); const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE] = { 0xcf, 0x83, 0xe1, 0x35, 0x7e, 0xef, 0xb8, 0xbd, 0xf1, 0x54, 0x28, 0x50, 0xd6, 0x6d, 0x80, 0x07, 0xd6, 0x20, 0xe4, 0x05, 0x0b, 0x57, 0x15, 0xdc, 0x83, 0xf4, 0xa9, 0x21, 0xd3, 0x6c, 0xe9, 0xce, 0x47, 0xd0, 0xd1, 0x3c, 0x5d, 0x85, 0xf2, 0xb0, 0xff, 0x83, 0x18, 0xd2, 0x87, 0x7e, 0xec, 0x2f, 0x63, 0xb9, 0x31, 0xbd, 0x47, 0x41, 0x7a, 0x81, 0xa5, 0x38, 0x32, 0x7a, 0xf9, 0x27, 0xda, 0x3e }; EXPORT_SYMBOL_GPL(sha512_zero_message_hash); static inline u64 Ch(u64 x, u64 y, u64 z) { return z ^ (x & (y ^ z)); } static inline u64 Maj(u64 x, u64 y, u64 z) { return (x & y) | (z & (x | y)); } static const u64 sha512_K[80] = { 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL, }; #define e0(x) (ror64(x,28) ^ ror64(x,34) ^ ror64(x,39)) #define e1(x) (ror64(x,14) ^ ror64(x,18) ^ ror64(x,41)) #define s0(x) (ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7)) #define s1(x) (ror64(x,19) ^ ror64(x,61) ^ (x >> 6)) static inline void LOAD_OP(int I, u64 *W, const u8 *input) { W[I] = get_unaligned_be64((__u64 *)input + I); } static inline void BLEND_OP(int I, u64 *W) { W[I & 15] += s1(W[(I-2) & 15]) + W[(I-7) & 15] + s0(W[(I-15) & 15]); } static void sha512_transform(u64 *state, const u8 *input) { u64 a, b, c, d, e, f, g, h, t1, t2; int i; u64 W[16]; /* load the state into our registers */ a=state[0]; b=state[1]; c=state[2]; d=state[3]; e=state[4]; f=state[5]; g=state[6]; h=state[7]; /* now iterate */ for (i=0; i<80; i+=8) { if (!(i & 8)) { int j; if (i < 16) { /* load the input */ for (j = 0; j < 16; j++) LOAD_OP(i + j, W, input); } else { for (j = 0; j < 16; j++) { BLEND_OP(i + j, W); } } } t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i ] + W[(i & 15)]; t2 = e0(a) + Maj(a,b,c); d+=t1; h=t1+t2; t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[(i & 15) + 1]; t2 = e0(h) + Maj(h,a,b); c+=t1; g=t1+t2; t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[(i & 15) + 2]; t2 = e0(g) + Maj(g,h,a); b+=t1; f=t1+t2; t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[(i & 15) + 3]; t2 = e0(f) + Maj(f,g,h); a+=t1; e=t1+t2; t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[(i & 15) + 4]; t2 = e0(e) + Maj(e,f,g); h+=t1; d=t1+t2; t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[(i & 15) + 5]; t2 = e0(d) + Maj(d,e,f); g+=t1; c=t1+t2; t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[(i & 15) + 6]; t2 = e0(c) + Maj(c,d,e); f+=t1; b=t1+t2; t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[(i & 15) + 7]; t2 = e0(b) + Maj(b,c,d); e+=t1; a=t1+t2; } state[0] += a; state[1] += b; state[2] += c; state[3] += d; state[4] += e; state[5] += f; state[6] += g; state[7] += h; } static void sha512_generic_block_fn(struct sha512_state *sst, u8 const *src, int blocks) { while (blocks--) { sha512_transform(sst->state, src); src += SHA512_BLOCK_SIZE; } } int crypto_sha512_update(struct shash_desc *desc, const u8 *data, unsigned int len) { return sha512_base_do_update(desc, data, len, sha512_generic_block_fn); } EXPORT_SYMBOL(crypto_sha512_update); static int sha512_final(struct shash_desc *desc, u8 *hash) { sha512_base_do_finalize(desc, sha512_generic_block_fn); return sha512_base_finish(desc, hash); } int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *hash) { sha512_base_do_update(desc, data, len, sha512_generic_block_fn); return sha512_final(desc, hash); } EXPORT_SYMBOL(crypto_sha512_finup); static struct shash_alg sha512_algs[2] = { { .digestsize = SHA512_DIGEST_SIZE, .init = sha512_base_init, .update = crypto_sha512_update, .final = sha512_final, .finup = crypto_sha512_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-generic", .cra_priority = 100, .cra_blocksize = SHA512_BLOCK_SIZE, .cra_module = THIS_MODULE, } }, { .digestsize = SHA384_DIGEST_SIZE, .init = sha384_base_init, .update = crypto_sha512_update, .final = sha512_final, .finup = crypto_sha512_finup, .descsize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-generic", .cra_priority = 100, .cra_blocksize = SHA384_BLOCK_SIZE, .cra_module = THIS_MODULE, } } }; static int __init sha512_generic_mod_init(void) { return crypto_register_shashes(sha512_algs, ARRAY_SIZE(sha512_algs)); } static void __exit sha512_generic_mod_fini(void) { crypto_unregister_shashes(sha512_algs, ARRAY_SIZE(sha512_algs)); } subsys_initcall(sha512_generic_mod_init); module_exit(sha512_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SHA-512 and SHA-384 Secure Hash Algorithms"); MODULE_ALIAS_CRYPTO("sha384"); MODULE_ALIAS_CRYPTO("sha384-generic"); MODULE_ALIAS_CRYPTO("sha512"); MODULE_ALIAS_CRYPTO("sha512-generic");
2470 9898 1064 9899 11018 4657 1062 1063 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Sleepable Read-Copy Update mechanism for mutual exclusion * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 * * Author: Paul McKenney <paulmck@linux.ibm.com> * Lai Jiangshan <laijs@cn.fujitsu.com> * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU/ *.txt * */ #ifndef _LINUX_SRCU_H #define _LINUX_SRCU_H #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/rcu_segcblist.h> struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); #define init_srcu_struct(ssp) \ ({ \ static struct lock_class_key __srcu_key; \ \ __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_TINY_SRCU #include <linux/srcutiny.h> #elif defined(CONFIG_TREE_SRCU) #include <linux/srcutree.h> #else #error "Unknown SRCU implementation specified to kernel configuration" #endif void call_srcu(struct srcu_struct *ssp, struct rcu_head *head, void (*func)(struct rcu_head *head)); void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); #ifdef CONFIG_NEED_SRCU_NMI_SAFE int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp); #else static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) { return __srcu_read_lock(ssp); } static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) { __srcu_read_unlock(ssp, idx); } #endif /* CONFIG_NEED_SRCU_NMI_SAFE */ void srcu_init(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC /** * srcu_read_lock_held - might we be in SRCU read-side critical section? * @ssp: The srcu_struct structure to check * * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, * this assumes we are in an SRCU read-side critical section unless it can * prove otherwise. * * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. * * Note that SRCU is based on its own statemachine and it doesn't * relies on normal RCU, it can be called from the CPU which * is in the idle loop from an RCU point of view or offline. */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { if (!debug_lockdep_rcu_enabled()) return 1; return lock_is_held(&ssp->dep_map); } /* * Annotations provide deadlock detection for SRCU. * * Similar to other lockdep annotations, except there is an additional * srcu_lock_sync(), which is basically an empty *write*-side critical section, * see lock_sync() for more information. */ /* Annotates a srcu_read_lock() */ static inline void srcu_lock_acquire(struct lockdep_map *map) { lock_map_acquire_read(map); } /* Annotates a srcu_read_lock() */ static inline void srcu_lock_release(struct lockdep_map *map) { lock_map_release(map); } /* Annotates a synchronize_srcu() */ static inline void srcu_lock_sync(struct lockdep_map *map) { lock_map_sync(map); } #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ static inline int srcu_read_lock_held(const struct srcu_struct *ssp) { return 1; } #define srcu_lock_acquire(m) do { } while (0) #define srcu_lock_release(m) do { } while (0) #define srcu_lock_sync(m) do { } while (0) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #define SRCU_NMI_UNKNOWN 0x0 #define SRCU_NMI_UNSAFE 0x1 #define SRCU_NMI_SAFE 0x2 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TREE_SRCU) void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe); #else static inline void srcu_check_nmi_safety(struct srcu_struct *ssp, bool nmi_safe) { } #endif /** * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * @c: condition to check for update-side use * * If PROVE_RCU is enabled, invoking this outside of an RCU read-side * critical section will result in an RCU-lockdep splat, unless @c evaluates * to 1. The @c argument will normally be a logical expression containing * lockdep_is_held() calls. */ #define srcu_dereference_check(p, ssp, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || srcu_read_lock_held(ssp), __rcu) /** * srcu_dereference - fetch SRCU-protected pointer for later dereferencing * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. * * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU * is enabled, invoking this outside of an RCU read-side critical * section will result in an RCU-lockdep splat. */ #define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0) /** * srcu_dereference_notrace - no tracing and no lockdep calls from here * @p: the pointer to fetch and protect for later dereferencing * @ssp: pointer to the srcu_struct, which is used to check that we * really are in an SRCU read-side critical section. */ #define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1) /** * srcu_read_lock - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section. Note that SRCU read-side * critical sections may be nested. However, it is illegal to * call anything that waits on an SRCU grace period for the same * srcu_struct, whether directly or indirectly. Please note that * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). * * Note that srcu_read_lock() and the matching srcu_read_unlock() must * occur in the same context, for example, it is illegal to invoke * srcu_read_unlock() in an irq handler if the matching srcu_read_lock() * was invoked in process context. */ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) { int retval; srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); srcu_lock_acquire(&ssp->dep_map); return retval; } /** * srcu_read_lock_nmisafe - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but in an NMI-safe manner. * See srcu_read_lock() for more information. */ static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp) { int retval; srcu_check_nmi_safety(ssp, true); retval = __srcu_read_lock_nmisafe(ssp); rcu_lock_acquire(&ssp->dep_map); return retval; } /* Used by tracing, cannot be traced and cannot invoke lockdep. */ static inline notrace int srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp) { int retval; srcu_check_nmi_safety(ssp, false); retval = __srcu_read_lock(ssp); return retval; } /** * srcu_down_read - register a new reader for an SRCU-protected structure. * @ssp: srcu_struct in which to register the new reader. * * Enter a semaphore-like SRCU read-side critical section. Note that * SRCU read-side critical sections may be nested. However, it is * illegal to call anything that waits on an SRCU grace period for the * same srcu_struct, whether directly or indirectly. Please note that * one way to indirectly wait on an SRCU grace period is to acquire * a mutex that is held elsewhere while calling synchronize_srcu() or * synchronize_srcu_expedited(). But if you want lockdep to help you * keep this stuff straight, you should instead use srcu_read_lock(). * * The semaphore-like nature of srcu_down_read() means that the matching * srcu_up_read() can be invoked from some other context, for example, * from some other task or from an irq handler. However, neither * srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler. * * Calls to srcu_down_read() may be nested, similar to the manner in * which calls to down_read() may be nested. */ static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp) { WARN_ON_ONCE(in_nmi()); srcu_check_nmi_safety(ssp, false); return __srcu_read_lock(ssp); } /** * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section. */ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, false); srcu_lock_release(&ssp->dep_map); __srcu_read_unlock(ssp, idx); } /** * srcu_read_unlock_nmisafe - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section, but in an NMI-safe manner. */ static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); srcu_check_nmi_safety(ssp, true); rcu_lock_release(&ssp->dep_map); __srcu_read_unlock_nmisafe(ssp, idx); } /* Used by tracing, cannot be traced and cannot call lockdep. */ static inline notrace void srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp) { srcu_check_nmi_safety(ssp, false); __srcu_read_unlock(ssp, idx); } /** * srcu_up_read - unregister a old reader from an SRCU-protected structure. * @ssp: srcu_struct in which to unregister the old reader. * @idx: return value from corresponding srcu_read_lock(). * * Exit an SRCU read-side critical section, but not necessarily from * the same context as the maching srcu_down_read(). */ static inline void srcu_up_read(struct srcu_struct *ssp, int idx) __releases(ssp) { WARN_ON_ONCE(idx & ~0x1); WARN_ON_ONCE(in_nmi()); srcu_check_nmi_safety(ssp, false); __srcu_read_unlock(ssp, idx); } /** * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock * * Converts the preceding srcu_read_unlock into a two-way memory barrier. * * Call this after srcu_read_unlock, to guarantee that all memory operations * that occur after smp_mb__after_srcu_read_unlock will appear to happen after * the preceding srcu_read_unlock. */ static inline void smp_mb__after_srcu_read_unlock(void) { /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */ } DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct, _T->idx = srcu_read_lock(_T->lock), srcu_read_unlock(_T->lock, _T->idx), int idx) #endif
1 145 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif
551 551 551 551 551 551 550 551 166 281 306 306 289 165 178 269 269 266 4 165 165 552 553 126 553 254 255 255 255 255 255 98 255 98 255 98 171 171 171 171 171 1 1 1 1 1 255 171 255 255 97 79 214 210 210 214 144 144 311 308 5 5 5 5 5 5 5 582 569 557 11 6 6 6 171 2 2 2 2 165 167 159 93 93 93 93 1 93 237 70 167 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * Initialization/cleanup for SCTP protocol support. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/seq_file.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/route.h> #include <net/sctp/sctp.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/udp_tunnel.h> #define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) /* Global data structures. */ struct sctp_globals sctp_globals __read_mostly; struct idr sctp_assocs_id; DEFINE_SPINLOCK(sctp_assocs_id_lock); static struct sctp_pf *sctp_pf_inet6_specific; static struct sctp_pf *sctp_pf_inet_specific; static struct sctp_af *sctp_af_v4_specific; static struct sctp_af *sctp_af_v6_specific; struct kmem_cache *sctp_chunk_cachep __read_mostly; struct kmem_cache *sctp_bucket_cachep __read_mostly; long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; /* Private helper to extract ipv4 address and stash them in * the protocol structure. */ static void sctp_v4_copy_addrlist(struct list_head *addrlist, struct net_device *dev) { struct in_device *in_dev; struct in_ifaddr *ifa; struct sctp_sockaddr_entry *addr; rcu_read_lock(); if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { rcu_read_unlock(); return; } in_dev_for_each_ifa_rcu(ifa, in_dev) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; INIT_LIST_HEAD(&addr->list); list_add_tail(&addr->list, addrlist); } } rcu_read_unlock(); } /* Extract our IP addresses from the system and stash them in the * protocol structure. */ static void sctp_get_local_addr_list(struct net *net) { struct net_device *dev; struct list_head *pos; struct sctp_af *af; rcu_read_lock(); for_each_netdev_rcu(net, dev) { list_for_each(pos, &sctp_address_families) { af = list_entry(pos, struct sctp_af, list); af->copy_addrlist(&net->sctp.local_addr_list, dev); } } rcu_read_unlock(); } /* Free the existing local addresses. */ static void sctp_free_local_addr_list(struct net *net) { struct sctp_sockaddr_entry *addr; struct list_head *pos, *temp; list_for_each_safe(pos, temp, &net->sctp.local_addr_list) { addr = list_entry(pos, struct sctp_sockaddr_entry, list); list_del(pos); kfree(addr); } } /* Copy the local addresses which are valid for 'scope' into 'bp'. */ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, enum sctp_scope scope, gfp_t gfp, int copy_flags) { struct sctp_sockaddr_entry *addr; union sctp_addr laddr; int error = 0; rcu_read_lock(); list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) { if (!addr->valid) continue; if (!sctp_in_scope(net, &addr->a, scope)) continue; /* Now that the address is in scope, check to see if * the address type is really supported by the local * sock as well as the remote peer. */ if (addr->a.sa.sa_family == AF_INET && (!(copy_flags & SCTP_ADDR4_ALLOWED) || !(copy_flags & SCTP_ADDR4_PEERSUPP))) continue; if (addr->a.sa.sa_family == AF_INET6 && (!(copy_flags & SCTP_ADDR6_ALLOWED) || !(copy_flags & SCTP_ADDR6_PEERSUPP))) continue; laddr = addr->a; /* also works for setting ipv6 address port */ laddr.v4.sin_port = htons(bp->port); if (sctp_bind_addr_state(bp, &laddr) != -1) continue; error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a), SCTP_ADDR_SRC, GFP_ATOMIC); if (error) break; } rcu_read_unlock(); return error; } /* Copy over any ip options */ static void sctp_v4_copy_ip_options(struct sock *sk, struct sock *newsk) { struct inet_sock *newinet, *inet = inet_sk(sk); struct ip_options_rcu *inet_opt, *newopt = NULL; newinet = inet_sk(newsk); rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); if (newopt) memcpy(newopt, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); else pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); rcu_read_unlock(); } /* Account for the IP options */ static int sctp_v4_ip_options_len(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ip_options_rcu *inet_opt; int len = 0; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) len = inet_opt->opt.optlen; rcu_read_unlock(); return len; } /* Initialize a sctp_addr from in incoming skb. */ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { /* Always called on head skb, so this is safe */ struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in *sa = &addr->v4; addr->v4.sin_family = AF_INET; if (is_saddr) { sa->sin_port = sh->source; sa->sin_addr.s_addr = ip_hdr(skb)->saddr; } else { sa->sin_port = sh->dest; sa->sin_addr.s_addr = ip_hdr(skb)->daddr; } memset(sa->sin_zero, 0, sizeof(sa->sin_zero)); } /* Initialize an sctp_addr from a socket. */ static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = 0; addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr; } /* Initialize sk->sk_daddr from sctp_addr. */ static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr; } /* Initialize a sctp_addr from an address parameter. */ static bool sctp_v4_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param)) return false; addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return true; } /* Initialize an address parameter from a sctp_addr and return the length * of the address parameter. */ static int sctp_v4_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { int length = sizeof(struct sctp_ipv4addr_param); param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS; param->v4.param_hdr.length = htons(length); param->v4.addr.s_addr = addr->v4.sin_addr.s_addr; return length; } /* Initialize a sctp_addr from a dst_entry. */ static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4, __be16 port) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = port; saddr->v4.sin_addr.s_addr = fl4->saddr; memset(saddr->v4.sin_zero, 0, sizeof(saddr->v4.sin_zero)); } /* Compare two addresses exactly. */ static int sctp_v4_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { if (addr1->sa.sa_family != addr2->sa.sa_family) return 0; if (addr1->v4.sin_port != addr2->v4.sin_port) return 0; if (addr1->v4.sin_addr.s_addr != addr2->v4.sin_addr.s_addr) return 0; return 1; } /* Initialize addr struct to INADDR_ANY. */ static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port) { addr->v4.sin_family = AF_INET; addr->v4.sin_addr.s_addr = htonl(INADDR_ANY); addr->v4.sin_port = port; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Is this a wildcard address? */ static int sctp_v4_is_any(const union sctp_addr *addr) { return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr; } /* This function checks if the address is a valid address to be used for * SCTP binding. * * Output: * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp, const struct sk_buff *skb) { /* IPv4 addresses not allowed */ if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; /* Is this a non-unicast address or a unusable SCTP address? */ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) return 0; /* Is this a broadcast address? */ if (skb && skb_rtable(skb)->rt_flags & RTCF_BROADCAST) return 0; return 1; } /* Should this be available for binding? */ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) { struct sock *sk = &sp->inet.sk; struct net *net = sock_net(sk); int tb_id = RT_TABLE_LOCAL; int ret; tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ?: tb_id; ret = inet_addr_type_table(net, addr->v4.sin_addr.s_addr, tb_id); if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !inet_test_bit(FREEBIND, sk) && !READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind)) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) return 0; return 1; } /* Checking the loopback, private and other address scopes as defined in * RFC 1918. The IPv4 scoping is based on the draft for SCTP IPv4 * scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>. * * Level 0 - unusable SCTP addresses * Level 1 - loopback address * Level 2 - link-local addresses * Level 3 - private addresses. * Level 4 - global addresses * For INIT and INIT-ACK address list, let L be the level of * requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * IPv4 scoping can be controlled through sysctl option * net.sctp.addr_scope_policy */ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr) { enum sctp_scope retval; /* Check for unusable SCTP addresses. */ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_UNUSABLE; } else if (ipv4_is_loopback(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LOOPBACK; } else if (ipv4_is_linklocal_169(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LINK; } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || ipv4_is_private_172(addr->v4.sin_addr.s_addr) || ipv4_is_private_192(addr->v4.sin_addr.s_addr) || ipv4_is_test_198(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_PRIVATE; } else { retval = SCTP_SCOPE_GLOBAL; } return retval; } /* Returns a valid dst cache entry for the given source and destination ip * addresses. If an association is passed, trys to get a dst entry with a * source address that matches an address in the bind address list. */ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk) { struct sctp_association *asoc = t->asoc; struct rtable *rt; struct flowi _fl; struct flowi4 *fl4 = &_fl.u.ip4; struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *laddr; struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; __u8 tos = inet_sk(sk)->tos; if (t->dscp & SCTP_DSCP_SET_MASK) tos = t->dscp & SCTP_DSCP_VAL_MASK; memset(&_fl, 0x0, sizeof(_fl)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { fl4->flowi4_tos = RT_TOS(tos); fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } if (saddr) { fl4->saddr = saddr->v4.sin_addr.s_addr; if (!fl4->fl4_sport) fl4->fl4_sport = saddr->v4.sin_port; } pr_debug("%s: dst:%pI4, src:%pI4 - ", __func__, &fl4->daddr, &fl4->saddr); rt = ip_route_output_key(sock_net(sk), fl4); if (!IS_ERR(rt)) { dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } /* If there is no association or if a source address is passed, no * more validation is required. */ if (!asoc || saddr) goto out; bp = &asoc->base.bind_addr; if (dst) { /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ sctp_v4_dst_saddr(&dst_saddr, fl4, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || (laddr->state == SCTP_ADDR_DEL) || (laddr->state != SCTP_ADDR_SRC && !asoc->src_out_of_asoc_ok)) continue; if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a)) goto out_unlock; } rcu_read_unlock(); /* None of the bound addresses match the source address of the * dst. So release it. */ dst_release(dst); dst = NULL; } /* Walk through the bind address list and try to get a dst that * matches a bind address as the source address. */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { struct net_device *odev; if (!laddr->valid) continue; if (laddr->state != SCTP_ADDR_SRC || AF_INET != laddr->a.sa.sa_family) continue; fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); rt = ip_route_output_key(sock_net(sk), fl4); if (IS_ERR(rt)) continue; /* Ensure the src address belongs to the output * interface. */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); if (!odev || odev->ifindex != fl4->flowi4_oif) { if (!dst) { dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } else { dst_release(&rt->dst); } continue; } dst_release(dst); dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); break; } out_unlock: rcu_read_unlock(); out: if (dst) { pr_debug("rt_dst:%pI4, rt_src:%pI4\n", &fl->u.ip4.daddr, &fl->u.ip4.saddr); } else { t->dst = NULL; pr_debug("no route\n"); } } /* For v4, the source address is cached in the route entry(dst). So no need * to cache it separately and hence this is an empty routine. */ static void sctp_v4_get_saddr(struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl) { union sctp_addr *saddr = &t->saddr; struct rtable *rt = (struct rtable *)t->dst; if (rt) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_addr.s_addr = fl->u.ip4.saddr; } } /* What interface did this skb arrive on? */ static int sctp_v4_skb_iif(const struct sk_buff *skb) { return inet_iif(skb); } static int sctp_v4_skb_sdif(const struct sk_buff *skb) { return inet_sdif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v4_is_ce(const struct sk_buff *skb) { return INET_ECN_is_ce(ip_hdr(skb)->tos); } /* Create and initialize a new sk for the socket returned by accept(). */ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, sk->sk_prot, kern); struct inet_sock *newinet; if (!newsk) goto out; sock_init_data(NULL, newsk); sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(newsk, SOCK_ZAPPED); sctp_v4_copy_ip_options(sk, newsk); newinet = inet_sk(newsk); newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; } out: return newsk; } static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { /* No address mapping for V4 sockets */ memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } /* Dump the v4 addr to the seq file. */ static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%pI4 ", &addr->v4.sin_addr); } static void sctp_v4_ecn_capable(struct sock *sk) { INET_ECN_xmit(sk); } static void sctp_addr_wq_timeout_handler(struct timer_list *t) { struct net *net = from_timer(net, t, sctp.addr_wq_timer); struct sctp_sockaddr_entry *addrw, *temp; struct sctp_sock *sp; spin_lock_bh(&net->sctp.addr_wq_lock); list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) { pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at " "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa, addrw->state, addrw); #if IS_ENABLED(CONFIG_IPV6) /* Now we send an ASCONF for each association */ /* Note. we currently don't handle link local IPv6 addressees */ if (addrw->a.sa.sa_family == AF_INET6) { struct in6_addr *in6; if (ipv6_addr_type(&addrw->a.v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) goto free_next; in6 = (struct in6_addr *)&addrw->a.v6.sin6_addr; if (ipv6_chk_addr(net, in6, NULL, 0) == 0 && addrw->state == SCTP_ADDR_NEW) { unsigned long timeo_val; pr_debug("%s: this is on DAD, trying %d sec " "later\n", __func__, SCTP_ADDRESS_TICK_DELAY); timeo_val = jiffies; timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY); mod_timer(&net->sctp.addr_wq_timer, timeo_val); break; } } #endif list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) { struct sock *sk; sk = sctp_opt2sk(sp); /* ignore bound-specific endpoints */ if (!sctp_is_ep_boundall(sk)) continue; bh_lock_sock(sk); if (sctp_asconf_mgmt(sp, addrw) < 0) pr_debug("%s: sctp_asconf_mgmt failed\n", __func__); bh_unlock_sock(sk); } #if IS_ENABLED(CONFIG_IPV6) free_next: #endif list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); } static void sctp_free_addr_wq(struct net *net) { struct sctp_sockaddr_entry *addrw; struct sctp_sockaddr_entry *temp; spin_lock_bh(&net->sctp.addr_wq_lock); del_timer(&net->sctp.addr_wq_timer); list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) { list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); } /* lookup the entry for the same address in the addr_waitq * sctp_addr_wq MUST be locked */ static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net, struct sctp_sockaddr_entry *addr) { struct sctp_sockaddr_entry *addrw; list_for_each_entry(addrw, &net->sctp.addr_waitq, list) { if (addrw->a.sa.sa_family != addr->a.sa.sa_family) continue; if (addrw->a.sa.sa_family == AF_INET) { if (addrw->a.v4.sin_addr.s_addr == addr->a.v4.sin_addr.s_addr) return addrw; } else if (addrw->a.sa.sa_family == AF_INET6) { if (ipv6_addr_equal(&addrw->a.v6.sin6_addr, &addr->a.v6.sin6_addr)) return addrw; } } return NULL; } void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cmd) { struct sctp_sockaddr_entry *addrw; unsigned long timeo_val; /* first, we check if an opposite message already exist in the queue. * If we found such message, it is removed. * This operation is a bit stupid, but the DHCP client attaches the * new address after a couple of addition and deletion of that address */ spin_lock_bh(&net->sctp.addr_wq_lock); /* Offsets existing events in addr_wq */ addrw = sctp_addr_wq_lookup(net, addr); if (addrw) { if (addrw->state != cmd) { pr_debug("%s: offsets existing entry for %d, addr:%pISc " "in wq:%p\n", __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq); list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); return; } /* OK, we have to add the new address to the wait queue */ addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC); if (addrw == NULL) { spin_unlock_bh(&net->sctp.addr_wq_lock); return; } addrw->state = cmd; list_add_tail(&addrw->list, &net->sctp.addr_waitq); pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n", __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq); if (!timer_pending(&net->sctp.addr_wq_timer)) { timeo_val = jiffies; timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY); mod_timer(&net->sctp.addr_wq_timer, timeo_val); } spin_unlock_bh(&net->sctp.addr_wq_lock); } /* Event handler for inet address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same * time and thus corrupt the list. * The reader side is protected with RCU. */ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sctp_sockaddr_entry *addr = NULL; struct sctp_sockaddr_entry *temp; struct net *net = dev_net(ifa->ifa_dev->dev); int found = 0; switch (ev) { case NETDEV_UP: addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW); spin_unlock_bh(&net->sctp.local_addr_lock); } break; case NETDEV_DOWN: spin_lock_bh(&net->sctp.local_addr_lock); list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET && addr->a.v4.sin_addr.s_addr == ifa->ifa_local) { sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } spin_unlock_bh(&net->sctp.local_addr_lock); if (found) kfree_rcu(addr, rcu); break; } return NOTIFY_DONE; } /* * Initialize the control inode/socket with a control endpoint data * structure. This endpoint is reserved exclusively for the OOTB processing. */ static int sctp_ctl_sock_init(struct net *net) { int err; sa_family_t family = PF_INET; if (sctp_get_pf_specific(PF_INET6)) family = PF_INET6; err = inet_ctl_sock_create(&net->sctp.ctl_sock, family, SOCK_SEQPACKET, IPPROTO_SCTP, net); /* If IPv6 socket could not be created, try the IPv4 socket */ if (err < 0 && family == PF_INET6) err = inet_ctl_sock_create(&net->sctp.ctl_sock, AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP, net); if (err < 0) { pr_err("Failed to create the SCTP control socket\n"); return err; } return 0; } static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb) { SCTP_INPUT_CB(skb)->encap_port = udp_hdr(skb)->source; skb_set_transport_header(skb, sizeof(struct udphdr)); sctp_rcv(skb); return 0; } int sctp_udp_sock_start(struct net *net) { struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct udp_port_cfg udp_conf = {0}; struct socket *sock; int err; udp_conf.family = AF_INET; udp_conf.local_ip.s_addr = htonl(INADDR_ANY); udp_conf.local_udp_port = htons(net->sctp.udp_port); err = udp_sock_create(net, &udp_conf, &sock); if (err) { pr_err("Failed to create the SCTP UDP tunneling v4 sock\n"); return err; } tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; tuncfg.encap_err_lookup = sctp_udp_v4_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp4_sock = sock->sk; #if IS_ENABLED(CONFIG_IPV6) memset(&udp_conf, 0, sizeof(udp_conf)); udp_conf.family = AF_INET6; udp_conf.local_ip6 = in6addr_any; udp_conf.local_udp_port = htons(net->sctp.udp_port); udp_conf.use_udp6_rx_checksums = true; udp_conf.ipv6_v6only = true; err = udp_sock_create(net, &udp_conf, &sock); if (err) { pr_err("Failed to create the SCTP UDP tunneling v6 sock\n"); udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); net->sctp.udp4_sock = NULL; return err; } tuncfg.encap_type = 1; tuncfg.encap_rcv = sctp_udp_rcv; tuncfg.encap_err_lookup = sctp_udp_v6_err; setup_udp_tunnel_sock(net, sock, &tuncfg); net->sctp.udp6_sock = sock->sk; #endif return 0; } void sctp_udp_sock_stop(struct net *net) { if (net->sctp.udp4_sock) { udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket); net->sctp.udp4_sock = NULL; } if (net->sctp.udp6_sock) { udp_tunnel_sock_release(net->sctp.udp6_sock->sk_socket); net->sctp.udp6_sock = NULL; } } /* Register address family specific functions. */ int sctp_register_af(struct sctp_af *af) { switch (af->sa_family) { case AF_INET: if (sctp_af_v4_specific) return 0; sctp_af_v4_specific = af; break; case AF_INET6: if (sctp_af_v6_specific) return 0; sctp_af_v6_specific = af; break; default: return 0; } INIT_LIST_HEAD(&af->list); list_add_tail(&af->list, &sctp_address_families); return 1; } /* Get the table of functions for manipulating a particular address * family. */ struct sctp_af *sctp_get_af_specific(sa_family_t family) { switch (family) { case AF_INET: return sctp_af_v4_specific; case AF_INET6: return sctp_af_v6_specific; default: return NULL; } } /* Common code to initialize a AF_INET msg_name. */ static void sctp_inet_msgname(char *msgname, int *addr_len) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)msgname; *addr_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } /* Copy the primary address of the peer primary address as the msg_name. */ static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addr_len) { struct sockaddr_in *sin, *sinfrom; if (msgname) { struct sctp_association *asoc; asoc = event->asoc; sctp_inet_msgname(msgname, addr_len); sin = (struct sockaddr_in *)msgname; sinfrom = &asoc->peer.primary_addr.v4; sin->sin_port = htons(asoc->peer.port); sin->sin_addr.s_addr = sinfrom->sin_addr.s_addr; } } /* Initialize and copy out a msgname from an inbound skb. */ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len) { if (msgname) { struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in *sin = (struct sockaddr_in *)msgname; sctp_inet_msgname(msgname, len); sin->sin_port = sh->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; } } /* Do we support this AF? */ static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp) { /* PF_INET only supports AF_INET addresses. */ return AF_INET == family; } /* Address matching with wildcards allowed. */ static int sctp_inet_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2, struct sctp_sock *opt) { /* PF_INET only supports AF_INET addresses. */ if (addr1->sa.sa_family != addr2->sa.sa_family) return 0; if (htonl(INADDR_ANY) == addr1->v4.sin_addr.s_addr || htonl(INADDR_ANY) == addr2->v4.sin_addr.s_addr) return 1; if (addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr) return 1; return 0; } /* Verify that provided sockaddr looks bindable. Common verification has * already been taken care of. */ static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) { return sctp_v4_available(addr, opt); } /* Verify that sockaddr looks sendable. Common verification has already * been taken care of. */ static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr) { return 1; } /* Fill in Supported Address Type information for INIT and INIT-ACK * chunks. Returns number of addresses supported. */ static int sctp_inet_supported_addrs(const struct sctp_sock *opt, __be16 *types) { types[0] = SCTP_PARAM_IPV4_ADDRESS; return 1; } /* Wrapper routine that calls the ip transmit routine. */ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) { struct dst_entry *dst = dst_clone(t->dst); struct flowi4 *fl4 = &t->fl.u.ip4; struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); __u8 dscp = inet->tos; __be16 df = 0; pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, skb->len, &fl4->saddr, &fl4->daddr); if (t->dscp & SCTP_DSCP_SET_MASK) dscp = t->dscp & SCTP_DSCP_VAL_MASK; inet->pmtudisc = t->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); if (!t->encap_port || !sctp_sk(sk)->udp_port) { skb_dst_set(skb, dst); return __ip_queue_xmit(sk, skb, &t->fl, dscp); } if (skb_is_gso(skb)) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; if (ip_dont_fragment(sk, dst) && !skb->ignore_df) df = htons(IP_DF); skb->encapsulation = 1; skb_reset_inner_mac_header(skb); skb_reset_inner_transport_header(skb); skb_set_inner_ipproto(skb, IPPROTO_SCTP); udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, sctp_sk(sk)->udp_port, t->encap_port, false, false); return 0; } static struct sctp_af sctp_af_inet; static struct sctp_pf sctp_pf_inet = { .event_msgname = sctp_inet_event_msgname, .skb_msgname = sctp_inet_skb_msgname, .af_supported = sctp_inet_af_supported, .cmp_addr = sctp_inet_cmp_addr, .bind_verify = sctp_inet_bind_verify, .send_verify = sctp_inet_send_verify, .supported_addrs = sctp_inet_supported_addrs, .create_accept_sk = sctp_v4_create_accept_sk, .addr_to_user = sctp_v4_addr_to_user, .to_sk_saddr = sctp_v4_to_sk_saddr, .to_sk_daddr = sctp_v4_to_sk_daddr, .copy_ip_options = sctp_v4_copy_ip_options, .af = &sctp_af_inet }; /* Notifier for inetaddr addition/deletion events. */ static struct notifier_block sctp_inetaddr_notifier = { .notifier_call = sctp_inetaddr_event, }; /* Socket operations. */ static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... */ .bind = inet_bind, .connect = sctp_inet_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ .poll = sctp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, }; /* Registration with AF_INET family. */ static struct inet_protosw sctp_seqpacket_protosw = { .type = SOCK_SEQPACKET, .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctp_stream_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static int sctp4_rcv(struct sk_buff *skb) { SCTP_INPUT_CB(skb)->encap_port = 0; return sctp_rcv(skb); } /* Register with IP layer. */ static const struct net_protocol sctp_protocol = { .handler = sctp4_rcv, .err_handler = sctp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; /* IPv4 address related functions. */ static struct sctp_af sctp_af_inet = { .sa_family = AF_INET, .sctp_xmit = sctp_v4_xmit, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .get_dst = sctp_v4_get_dst, .get_saddr = sctp_v4_get_saddr, .copy_addrlist = sctp_v4_copy_addrlist, .from_skb = sctp_v4_from_skb, .from_sk = sctp_v4_from_sk, .from_addr_param = sctp_v4_from_addr_param, .to_addr_param = sctp_v4_to_addr_param, .cmp_addr = sctp_v4_cmp_addr, .addr_valid = sctp_v4_addr_valid, .inaddr_any = sctp_v4_inaddr_any, .is_any = sctp_v4_is_any, .available = sctp_v4_available, .scope = sctp_v4_scope, .skb_iif = sctp_v4_skb_iif, .skb_sdif = sctp_v4_skb_sdif, .is_ce = sctp_v4_is_ce, .seq_dump_addr = sctp_v4_seq_dump_addr, .ecn_capable = sctp_v4_ecn_capable, .net_header_len = sizeof(struct iphdr), .sockaddr_len = sizeof(struct sockaddr_in), .ip_options_len = sctp_v4_ip_options_len, }; struct sctp_pf *sctp_get_pf_specific(sa_family_t family) { switch (family) { case PF_INET: return sctp_pf_inet_specific; case PF_INET6: return sctp_pf_inet6_specific; default: return NULL; } } /* Register the PF specific function table. */ int sctp_register_pf(struct sctp_pf *pf, sa_family_t family) { switch (family) { case PF_INET: if (sctp_pf_inet_specific) return 0; sctp_pf_inet_specific = pf; break; case PF_INET6: if (sctp_pf_inet6_specific) return 0; sctp_pf_inet6_specific = pf; break; default: return 0; } return 1; } static inline int init_sctp_mibs(struct net *net) { net->sctp.sctp_statistics = alloc_percpu(struct sctp_mib); if (!net->sctp.sctp_statistics) return -ENOMEM; return 0; } static inline void cleanup_sctp_mibs(struct net *net) { free_percpu(net->sctp.sctp_statistics); } static void sctp_v4_pf_init(void) { /* Initialize the SCTP specific PF functions. */ sctp_register_pf(&sctp_pf_inet, PF_INET); sctp_register_af(&sctp_af_inet); } static void sctp_v4_pf_exit(void) { list_del(&sctp_af_inet.list); } static int sctp_v4_protosw_init(void) { int rc; rc = proto_register(&sctp_prot, 1); if (rc) return rc; /* Register SCTP(UDP and TCP style) with socket layer. */ inet_register_protosw(&sctp_seqpacket_protosw); inet_register_protosw(&sctp_stream_protosw); return 0; } static void sctp_v4_protosw_exit(void) { inet_unregister_protosw(&sctp_stream_protosw); inet_unregister_protosw(&sctp_seqpacket_protosw); proto_unregister(&sctp_prot); } static int sctp_v4_add_protocol(void) { /* Register notifier for inet address additions/deletions. */ register_inetaddr_notifier(&sctp_inetaddr_notifier); /* Register SCTP with inet layer. */ if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) return -EAGAIN; return 0; } static void sctp_v4_del_protocol(void) { inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } static int __net_init sctp_defaults_init(struct net *net) { int status; /* * 14. Suggested SCTP Protocol Parameter Values */ /* The following protocol parameters are RECOMMENDED: */ /* RTO.Initial - 3 seconds */ net->sctp.rto_initial = SCTP_RTO_INITIAL; /* RTO.Min - 1 second */ net->sctp.rto_min = SCTP_RTO_MIN; /* RTO.Max - 60 seconds */ net->sctp.rto_max = SCTP_RTO_MAX; /* RTO.Alpha - 1/8 */ net->sctp.rto_alpha = SCTP_RTO_ALPHA; /* RTO.Beta - 1/4 */ net->sctp.rto_beta = SCTP_RTO_BETA; /* Valid.Cookie.Life - 60 seconds */ net->sctp.valid_cookie_life = SCTP_DEFAULT_COOKIE_LIFE; /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; /* Default sctp sockets to use md5 as their hmac alg */ #if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) net->sctp.sctp_hmac_alg = "md5"; #elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) net->sctp.sctp_hmac_alg = "sha1"; #else net->sctp.sctp_hmac_alg = NULL; #endif /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; /* Disable of Primary Path Switchover by default */ net->sctp.ps_retrans = SCTP_PS_RETRANS_MAX; /* Enable pf state by default */ net->sctp.pf_enable = 1; /* Ignore pf exposure feature by default */ net->sctp.pf_expose = SCTP_PF_EXPOSE_UNSET; /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts */ net->sctp.max_retrans_association = 10; net->sctp.max_retrans_path = 5; net->sctp.max_retrans_init = 8; /* Sendbuffer growth - do per-socket accounting */ net->sctp.sndbuf_policy = 0; /* Rcvbuffer growth - do per-socket accounting */ net->sctp.rcvbuf_policy = 0; /* HB.interval - 30 seconds */ net->sctp.hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; /* delayed SACK timeout */ net->sctp.sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK; /* Disable ADDIP by default. */ net->sctp.addip_enable = 0; net->sctp.addip_noauth = 0; net->sctp.default_auto_asconf = 0; /* Enable PR-SCTP by default. */ net->sctp.prsctp_enable = 1; /* Disable RECONF by default. */ net->sctp.reconf_enable = 0; /* Disable AUTH by default. */ net->sctp.auth_enable = 0; /* Enable ECN by default. */ net->sctp.ecn_enable = 1; /* Set UDP tunneling listening port to 0 by default */ net->sctp.udp_port = 0; /* Set remote encap port to 0 by default */ net->sctp.encap_port = 0; /* Set SCOPE policy to enabled */ net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE; /* Set the default rwnd update threshold */ net->sctp.rwnd_upd_shift = SCTP_DEFAULT_RWND_SHIFT; /* Initialize maximum autoclose timeout. */ net->sctp.max_autoclose = INT_MAX / HZ; #ifdef CONFIG_NET_L3_MASTER_DEV net->sctp.l3mdev_accept = 1; #endif status = sctp_sysctl_net_register(net); if (status) goto err_sysctl_register; /* Allocate and initialise sctp mibs. */ status = init_sctp_mibs(net); if (status) goto err_init_mibs; #ifdef CONFIG_PROC_FS /* Initialize proc fs directory. */ status = sctp_proc_init(net); if (status) goto err_init_proc; #endif sctp_dbg_objcnt_init(net); /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); sctp_get_local_addr_list(net); /* Initialize the address event list */ INIT_LIST_HEAD(&net->sctp.addr_waitq); INIT_LIST_HEAD(&net->sctp.auto_asconf_splist); spin_lock_init(&net->sctp.addr_wq_lock); net->sctp.addr_wq_timer.expires = 0; timer_setup(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, 0); return 0; #ifdef CONFIG_PROC_FS err_init_proc: cleanup_sctp_mibs(net); #endif err_init_mibs: sctp_sysctl_net_unregister(net); err_sysctl_register: return status; } static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); #ifdef CONFIG_PROC_FS remove_proc_subtree("sctp", net->proc_net); net->sctp.proc_net_sctp = NULL; #endif cleanup_sctp_mibs(net); sctp_sysctl_net_unregister(net); } static struct pernet_operations sctp_defaults_ops = { .init = sctp_defaults_init, .exit = sctp_defaults_exit, }; static int __net_init sctp_ctrlsock_init(struct net *net) { int status; /* Initialize the control inode/socket for handling OOTB packets. */ status = sctp_ctl_sock_init(net); if (status) pr_err("Failed to initialize the SCTP control sock\n"); return status; } static void __net_exit sctp_ctrlsock_exit(struct net *net) { /* Free the control endpoint. */ inet_ctl_sock_destroy(net->sctp.ctl_sock); } static struct pernet_operations sctp_ctrlsock_ops = { .init = sctp_ctrlsock_init, .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ static __init int sctp_init(void) { unsigned long nr_pages = totalram_pages(); unsigned long limit; unsigned long goal; int max_entry_order; int num_entries; int max_share; int status; int order; int i; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); /* Allocate bind_bucket and chunk caches. */ status = -ENOBUFS; sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", sizeof(struct sctp_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL); if (!sctp_bucket_cachep) goto out; sctp_chunk_cachep = kmem_cache_create("sctp_chunk", sizeof(struct sctp_chunk), 0, SLAB_HWCACHE_ALIGN, NULL); if (!sctp_chunk_cachep) goto err_chunk_cachep; status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; /* Implementation specific variables. */ /* Initialize default stream count setup information. */ sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; /* Initialize handle used for association ids. */ idr_init(&sctp_assocs_id); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_sctp_mem[0] = limit / 4 * 3; sysctl_sctp_mem[1] = limit; sysctl_sctp_mem[2] = sysctl_sctp_mem[0] * 2; /* Set per-socket limits to no more than 1/128 the pressure threshold*/ limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); sysctl_sctp_rmem[0] = PAGE_SIZE; /* give each asoc 1 page min */ sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1); sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share); sysctl_sctp_wmem[0] = PAGE_SIZE; sysctl_sctp_wmem[1] = 16*1024; sysctl_sctp_wmem[2] = max(64*1024, max_share); /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. * Though not identical. Start by getting a goal size */ if (nr_pages >= (128 * 1024)) goal = nr_pages >> (22 - PAGE_SHIFT); else goal = nr_pages >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); /* Now compute the required page order for the maximum sized table we * want to create */ max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * sizeof(struct sctp_bind_hashbucket)); /* Limit the page order by that maximum hash table size */ order = min(order, max_entry_order); /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; sctp_ep_hashtable = kmalloc_array(64, sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; goto err_ehash_alloc; } for (i = 0; i < sctp_ep_hashsize; i++) { rwlock_init(&sctp_ep_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } /* Allocate and initialize the SCTP port hash table. * Note that order is initalized to start at the max sized * table we want to support. If we can't get that many pages * reduce the order and try again */ do { sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } /* Now compute the number of entries that will fit in the * port hash space we allocated */ num_entries = (1UL << order) * PAGE_SIZE / sizeof(struct sctp_bind_hashbucket); /* And finish by rounding it down to the nearest power of two. * This wastes some memory of course, but it's needed because * the hash function operates based on the assumption that * the number of entries is a power of two. */ sctp_port_hashsize = rounddown_pow_of_two(num_entries); for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } status = sctp_transport_hashtable_init(); if (status) goto err_thash_alloc; pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, num_entries); sctp_sysctl_register(); INIT_LIST_HEAD(&sctp_address_families); sctp_v4_pf_init(); sctp_v6_pf_init(); sctp_sched_ops_init(); status = register_pernet_subsys(&sctp_defaults_ops); if (status) goto err_register_defaults; status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; status = sctp_v6_protosw_init(); if (status) goto err_v6_protosw_init; status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) goto err_add_protocol; /* Register SCTP with inet6 layer. */ status = sctp_v6_add_protocol(); if (status) goto err_v6_add_protocol; if (sctp_offload_init() < 0) pr_crit("%s: Cannot add SCTP protocol offload\n", __func__); out: return status; err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: unregister_pernet_subsys(&sctp_ctrlsock_ops); err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: unregister_pernet_subsys(&sctp_defaults_ops); err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); err_bhash_alloc: sctp_transport_hashtable_destroy(); err_thash_alloc: kfree(sctp_ep_hashtable); err_ehash_alloc: percpu_counter_destroy(&sctp_sockets_allocated); err_percpu_counter_init: kmem_cache_destroy(sctp_chunk_cachep); err_chunk_cachep: kmem_cache_destroy(sctp_bucket_cachep); goto out; } /* Exit handler for the SCTP protocol. */ static __exit void sctp_exit(void) { /* BUG. This should probably do something useful like clean * up all the remaining associations and all that memory. */ /* Unregister with inet6/inet layers. */ sctp_v6_del_protocol(); sctp_v4_del_protocol(); unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); unregister_pernet_subsys(&sctp_defaults_ops); /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); sctp_sysctl_unregister(); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); kfree(sctp_ep_hashtable); sctp_transport_hashtable_destroy(); percpu_counter_destroy(&sctp_sockets_allocated); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(sctp_chunk_cachep); kmem_cache_destroy(sctp_bucket_cachep); } module_init(sctp_init); module_exit(sctp_exit); /* * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly. */ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>"); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); MODULE_LICENSE("GPL");
1640 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Definitions for mount interface. This describes the in the kernel build * linkedlist with mounted filesystems. * * Author: Marco van Wieringen <mvw@planets.elm.net> * */ #ifndef _LINUX_MOUNT_H #define _LINUX_MOUNT_H #include <linux/types.h> #include <asm/barrier.h> struct super_block; struct dentry; struct user_namespace; struct mnt_idmap; struct file_system_type; struct fs_context; struct file; struct path; #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 #define MNT_NOEXEC 0x04 #define MNT_NOATIME 0x08 #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40 /* does the user want this to be r/o? */ #define MNT_NOSYMFOLLOW 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_WRITE_HOLD 0x200 #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ /* * MNT_SHARED_MASK is the set of flags that should be cleared when a * mount becomes shared. Currently, this is only the flag that says a * mount cannot be bind mounted, since this is how we create a mount * that shares events with another mount. If you add a new MNT_* * flag, consider how it interacts with shared mounts. */ #define MNT_SHARED_MASK (MNT_UNBINDABLE) #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ | MNT_READONLY | MNT_NOSYMFOLLOW) #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ MNT_CURSOR) #define MNT_INTERNAL 0x4000 #define MNT_LOCK_ATIME 0x040000 #define MNT_LOCK_NOEXEC 0x080000 #define MNT_LOCK_NOSUID 0x100000 #define MNT_LOCK_NODEV 0x200000 #define MNT_LOCK_READONLY 0x400000 #define MNT_LOCKED 0x800000 #define MNT_DOOMED 0x1000000 #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 #define MNT_CURSOR 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; struct mnt_idmap *mnt_idmap; } __randomize_layout; static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) { /* Pairs with smp_store_release() in do_idmap_mount(). */ return smp_load_acquire(&mnt->mnt_idmap); } extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); extern struct vfsmount *mntget(struct vfsmount *mnt); extern void mnt_make_shortterm(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(const struct path *path); extern bool __mnt_is_readonly(struct vfsmount *mnt); extern bool mnt_may_suid(struct vfsmount *mnt); extern struct vfsmount *clone_private_mount(const struct path *path); extern int __mnt_want_write(struct vfsmount *); extern void __mnt_drop_write(struct vfsmount *); extern struct vfsmount *fc_mount(struct fs_context *fc); extern struct vfsmount *vfs_create_mount(struct fs_context *fc); extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data); extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, const char *name, void *data); extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern bool path_is_mountpoint(const struct path *path); extern bool our_mnt(struct vfsmount *mnt); extern struct vfsmount *kern_mount(struct file_system_type *); extern void kern_unmount(struct vfsmount *mnt); extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); extern long do_mount(const char *, const char __user *, const char *, unsigned long, void *); extern struct vfsmount *collect_mounts(const struct path *); extern void drop_collected_mounts(struct vfsmount *); extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, struct vfsmount *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); #endif /* _LINUX_MOUNT_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 // SPDX-License-Identifier: GPL-2.0-or-later /* * connector.c * * 2004+ Copyright (c) Evgeniy Polyakov <zbr@ioremap.net> * All rights reserved. */ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/list.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <linux/moduleparam.h> #include <linux/connector.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <net/sock.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Generic userspace <-> kernelspace connector."); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_CONNECTOR); static struct cn_dev cdev; static int cn_already_initialized; /* * Sends mult (multiple) cn_msg at a time. * * msg->seq and msg->ack are used to determine message genealogy. * When someone sends message it puts there locally unique sequence * and random acknowledge numbers. Sequence number may be copied into * nlmsghdr->nlmsg_seq too. * * Sequence number is incremented with each message to be sent. * * If we expect a reply to our message then the sequence number in * received message MUST be the same as in original message, and * acknowledge number MUST be the same + 1. * * If we receive a message and its sequence number is not equal to the * one we are expecting then it is a new message. * * If we receive a message and its sequence number is the same as one * we are expecting but it's acknowledgement number is not equal to * the acknowledgement number in the original message + 1, then it is * a new message. * * If msg->len != len, then additional cn_msg messages are expected following * the first msg. * * The message is sent to, the portid if given, the group if given, both if * both, or if both are zero then the group is looked up and sent there. */ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 __group, gfp_t gfp_mask, int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data), void *filter_data) { struct cn_callback_entry *__cbq; unsigned int size; struct sk_buff *skb; struct nlmsghdr *nlh; struct cn_msg *data; struct cn_dev *dev = &cdev; u32 group = 0; int found = 0; if (portid || __group) { group = __group; } else { spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(__cbq, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&__cbq->id.id, &msg->id)) { found = 1; group = __cbq->group; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (!found) return -ENODEV; } if (!portid && !netlink_has_listeners(dev->nls, group)) return -ESRCH; size = sizeof(*msg) + len; skb = nlmsg_new(size, gfp_mask); if (!skb) return -ENOMEM; nlh = nlmsg_put(skb, 0, msg->seq, NLMSG_DONE, size, 0); if (!nlh) { kfree_skb(skb); return -EMSGSIZE; } data = nlmsg_data(nlh); memcpy(data, msg, size); NETLINK_CB(skb).dst_group = group; if (group) return netlink_broadcast_filtered(dev->nls, skb, portid, group, gfp_mask, filter, (void *)filter_data); return netlink_unicast(dev->nls, skb, portid, !gfpflags_allow_blocking(gfp_mask)); } EXPORT_SYMBOL_GPL(cn_netlink_send_mult); /* same as cn_netlink_send_mult except msg->len is used for len */ int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, gfp_t gfp_mask) { return cn_netlink_send_mult(msg, msg->len, portid, __group, gfp_mask, NULL, NULL); } EXPORT_SYMBOL_GPL(cn_netlink_send); /* * Callback helper - queues work and setup destructor for given data. */ static int cn_call_callback(struct sk_buff *skb) { struct nlmsghdr *nlh; struct cn_callback_entry *i, *cbq = NULL; struct cn_dev *dev = &cdev; struct cn_msg *msg = nlmsg_data(nlmsg_hdr(skb)); struct netlink_skb_parms *nsp = &NETLINK_CB(skb); int err = -ENODEV; /* verify msg->len is within skb */ nlh = nlmsg_hdr(skb); if (nlh->nlmsg_len < NLMSG_HDRLEN + sizeof(struct cn_msg) + msg->len) return -EINVAL; spin_lock_bh(&dev->cbdev->queue_lock); list_for_each_entry(i, &dev->cbdev->queue_list, callback_entry) { if (cn_cb_equal(&i->id.id, &msg->id)) { refcount_inc(&i->refcnt); cbq = i; break; } } spin_unlock_bh(&dev->cbdev->queue_lock); if (cbq != NULL) { cbq->callback(msg, nsp); kfree_skb(skb); cn_queue_release_callback(cbq); err = 0; } return err; } /* * Allow non-root access for NETLINK_CONNECTOR family having CN_IDX_PROC * multicast group. */ static int cn_bind(struct net *net, int group) { unsigned long groups = (unsigned long) group; if (ns_capable(net->user_ns, CAP_NET_ADMIN)) return 0; if (test_bit(CN_IDX_PROC - 1, &groups)) return 0; return -EPERM; } static void cn_release(struct sock *sk, unsigned long *groups) { if (groups && test_bit(CN_IDX_PROC - 1, groups)) { kfree(sk->sk_user_data); sk->sk_user_data = NULL; } } /* * Main netlink receiving function. * * It checks skb, netlink header and msg sizes, and calls callback helper. */ static void cn_rx_skb(struct sk_buff *skb) { struct nlmsghdr *nlh; int len, err; if (skb->len >= NLMSG_HDRLEN) { nlh = nlmsg_hdr(skb); len = nlmsg_len(nlh); if (len < (int)sizeof(struct cn_msg) || skb->len < nlh->nlmsg_len || len > CONNECTOR_MAX_MSG_SIZE) return; err = cn_call_callback(skb_get(skb)); if (err < 0) kfree_skb(skb); } } /* * Callback add routing - adds callback with given ID and name. * If there is registered callback with the same ID it will not be added. * * May sleep. */ int cn_add_callback(const struct cb_id *id, const char *name, void (*callback)(struct cn_msg *, struct netlink_skb_parms *)) { struct cn_dev *dev = &cdev; if (!cn_already_initialized) return -EAGAIN; return cn_queue_add_callback(dev->cbdev, name, id, callback); } EXPORT_SYMBOL_GPL(cn_add_callback); /* * Callback remove routing - removes callback * with given ID. * If there is no registered callback with given * ID nothing happens. * * May sleep while waiting for reference counter to become zero. */ void cn_del_callback(const struct cb_id *id) { struct cn_dev *dev = &cdev; cn_queue_del_callback(dev->cbdev, id); } EXPORT_SYMBOL_GPL(cn_del_callback); static int __maybe_unused cn_proc_show(struct seq_file *m, void *v) { struct cn_queue_dev *dev = cdev.cbdev; struct cn_callback_entry *cbq; seq_printf(m, "Name ID\n"); spin_lock_bh(&dev->queue_lock); list_for_each_entry(cbq, &dev->queue_list, callback_entry) { seq_printf(m, "%-15s %u:%u\n", cbq->id.name, cbq->id.id.idx, cbq->id.id.val); } spin_unlock_bh(&dev->queue_lock); return 0; } static int cn_init(void) { struct cn_dev *dev = &cdev; struct netlink_kernel_cfg cfg = { .groups = CN_NETLINK_USERS + 0xf, .input = cn_rx_skb, .flags = NL_CFG_F_NONROOT_RECV, .bind = cn_bind, .release = cn_release, }; dev->nls = netlink_kernel_create(&init_net, NETLINK_CONNECTOR, &cfg); if (!dev->nls) return -EIO; dev->cbdev = cn_queue_alloc_dev("cqueue", dev->nls); if (!dev->cbdev) { netlink_kernel_release(dev->nls); return -EINVAL; } cn_already_initialized = 1; proc_create_single("connector", S_IRUGO, init_net.proc_net, cn_proc_show); return 0; } static void cn_fini(void) { struct cn_dev *dev = &cdev; cn_already_initialized = 0; remove_proc_entry("connector", init_net.proc_net); cn_queue_free_dev(dev->cbdev); netlink_kernel_release(dev->nls); } subsys_initcall(cn_init); module_exit(cn_fini);
56 8 3 3 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DEBUGREG_H #define _ASM_X86_DEBUGREG_H #include <linux/bug.h> #include <linux/percpu.h> #include <uapi/asm/debugreg.h> DECLARE_PER_CPU(unsigned long, cpu_dr7); #ifndef CONFIG_PARAVIRT_XXL /* * These special macros can be used to get or set a debugging register */ #define get_debugreg(var, register) \ (var) = native_get_debugreg(register) #define set_debugreg(value, register) \ native_set_debugreg(register, value) #endif static __always_inline unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! */ switch (regno) { case 0: asm("mov %%db0, %0" :"=r" (val)); break; case 1: asm("mov %%db1, %0" :"=r" (val)); break; case 2: asm("mov %%db2, %0" :"=r" (val)); break; case 3: asm("mov %%db3, %0" :"=r" (val)); break; case 6: asm("mov %%db6, %0" :"=r" (val)); break; case 7: /* * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them * with other code. * * This is needed because a DR7 access can cause a #VC exception * when running under SEV-ES. Taking a #VC exception is not a * safe thing to do just anywhere in the entry code and * re-ordering might place the access into an unsafe location. * * This happened in the NMI handler, where the DR7 read was * re-ordered to happen before the call to sev_es_ist_enter(), * causing stack recursion. */ asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); } return val; } static __always_inline void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: asm("mov %0, %%db0" ::"r" (value)); break; case 1: asm("mov %0, %%db1" ::"r" (value)); break; case 2: asm("mov %0, %%db2" ::"r" (value)); break; case 3: asm("mov %0, %%db3" ::"r" (value)); break; case 6: asm("mov %0, %%db6" ::"r" (value)); break; case 7: /* * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them * with other code. * * While is didn't happen with a DR7 write (see the DR7 read * comment above which explains where it happened), add the * __FORCE_ORDER here too to avoid similar problems in the * future. */ asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); } } static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ set_debugreg(0UL, 7); /* Zero-out the individual HW breakpoint address registers */ set_debugreg(0UL, 0); set_debugreg(0UL, 1); set_debugreg(0UL, 2); set_debugreg(0UL, 3); } static __always_inline bool hw_breakpoint_active(void) { return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; } extern void hw_breakpoint_restore(void); static __always_inline unsigned long local_db_save(void) { unsigned long dr7; if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active()) return 0; get_debugreg(dr7, 7); dr7 &= ~0x400; /* architecturally set bit */ if (dr7) set_debugreg(0, 7); /* * Ensure the compiler doesn't lower the above statements into * the critical section; disabling breakpoints late would not * be good. */ barrier(); return dr7; } static __always_inline void local_db_restore(unsigned long dr7) { /* * Ensure the compiler doesn't raise this statement into * the critical section; enabling breakpoints early would * not be good. */ barrier(); if (dr7) set_debugreg(dr7, 7); } #ifdef CONFIG_CPU_SUP_AMD extern void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr); extern unsigned long amd_get_dr_addr_mask(unsigned int dr); #else static inline void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) { } static inline unsigned long amd_get_dr_addr_mask(unsigned int dr) { return 0; } #endif #endif /* _ASM_X86_DEBUGREG_H */
988 988 988 988 988 988 10330 10328 10329 3799 10329 53 2889 988 988 988 988 988 988 988 988 988 988 988 988 988 988 988 988 10329 10328 10337 10335 10337 10329 10329 10329 10326 10329 10329 2892 2862 9559 9558 9558 9559 9559 1076 1075 1056 1053 1055 4 16167 15820 15817 15815 15818 15820 55 15834 15971 810 2987 3911 2986 3939 2092 5 5 58 8 58 58 57 45 45 45 45 45 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/file.c * * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes * * Manage the dynamic fd arrays in the process files_struct. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <net/sock.h> #include "internal.h" unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. */ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int count) { unsigned int cpy, set; cpy = count / BITS_PER_BYTE; set = (nfdt->max_fds - count) / BITS_PER_BYTE; memcpy(nfdt->open_fds, ofdt->open_fds, cpy); memset((char *)nfdt->open_fds + cpy, 0, set); memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); memset((char *)nfdt->close_on_exec + cpy, 0, set); cpy = BITBIT_SIZE(count); set = BITBIT_SIZE(nfdt->max_fds) - cpy; memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); memset((char *)nfdt->full_fds_bits + cpy, 0, set); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". * * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied * by that "1024/sizeof(ptr)" before, we already know there are sufficient * clear low bits. Clang seems to realize that, gcc ends up being confused. * * On a 128-bit machine, the ALIGN() would actually matter. In the meantime, * let's consider it documentation (and maybe a test-case for gcc to improve * its code generation ;) */ static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. */ nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); nr = ALIGN(nr, BITS_PER_LONG); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. Deal * with that in caller, it's cheaper that way. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return NULL; } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 1 on successful completion. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; /* * extremely unlikely race - sysctl_nr_open decreased between the check in * caller and alloc_fdtable(). Cheaper to catch it here... */ if (unlikely(new_fdt->max_fds <= nr)) { __free_fdtable(new_fdt); return -EMFILE; } cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 1; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 when nothing done; 1 when files were * expanded and execution may have blocked. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int expanded = 0; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); expanded = 1; wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* All good, so we try */ files->resize_in_progress = true; expanded = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return expanded; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->close_on_exec); } static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt) { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt) { __set_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits); } static unsigned int count_open_files(struct fdtable *fdt) { unsigned int size = fdt->max_fds; unsigned int i; /* Find the last open fd */ for (i = size / BITS_PER_LONG; i > 0; ) { if (fdt->open_fds[--i]) break; } i = (i + 1) * BITS_PER_LONG; return i; } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * 'max_fds' will normally already be properly aligned, but it * turns out that in the close_range() -> __close_range() -> * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end * up having a 'max_fds' value that isn't already aligned. * * Rather than make close_range() have to worry about this, * just make that BITS_PER_LONG alignment be part of a sane * fdtable size. Becuase that's really what it is. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds) { unsigned int count; count = count_open_files(fdt); if (max_fds < NR_OPEN_DEFAULT) max_fds = NR_OPEN_DEFAULT; return ALIGN(min(count, max_fds), BITS_PER_LONG); } /* * Allocate a new files structure and copy contents from the * passed in files structure. * errorp will be valid only when the returned files_struct is NULL. */ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; *errorp = -ENOMEM; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) goto out; atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, max_fds); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files - 1); if (!new_fdt) { *errorp = -ENOMEM; goto out_release; } /* beyond sysctl_nr_open; nothing to do */ if (unlikely(new_fdt->max_fds < open_files)) { __free_fdtable(new_fdt); *errorp = -EMFILE; goto out_release; } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, max_fds); } copy_fd_bitmaps(new_fdt, old_fdt, open_files); old_fds = old_fdt->fd; new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { get_file(f); } else { /* * The fd may be claimed in the fd bitmap but not yet * instantiated in the files array if a sibling thread * is partway through open(). So make sure that this * fd is available to the new process. */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; out_release: kmem_cache_free(files_cachep, newf); out: return NULL; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. */ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit > maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (fd < fdt->max_fds) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (fd >= end) goto out; error = expand_files(files, fd); if (error < 0) goto out; /* * If we needed to expand the fs array we * might have blocked - try again. */ if (error) goto repeat; if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); error = fd; #if 1 /* Sanity check */ if (rcu_access_pointer(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } #endif out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array. * * The VFS is full of places where we drop the files lock between * setting the open_fds bitmap and installing the file in the file * array. At any such point, we are vulnerable to a dup2() race * installing a file in the array before us. We need to detect this and * fput() the struct file we are about to overwrite in this case. * * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * pick_file - return file associatd with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ static struct file *pick_file(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = fdt->fd[fd]; if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = pick_file(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = pick_file(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * __close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. */ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if (flags & CLOSE_RANGE_UNSHARE) { int ret; unsigned int max_unshare_fds = NR_OPEN_MAX; /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (!(flags & CLOSE_RANGE_CLOEXEC)) { /* * If the requested range is greater than the current * maximum, we're closing everything so only copy all * file descriptors beneath the lowest file descriptor. */ rcu_read_lock(); if (max_fd >= last_fd(files_fdtable(cur_fds))) max_unshare_fds = fd; rcu_read_unlock(); } ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); if (ret) return ret; /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ if (fds) swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /* * See close_fd_get_file() below, this variant assumes current->files->file_lock * is held. */ struct file *__close_fd_get_file(unsigned int fd) { return pick_file(current->files, fd); } /* * variant of close_fd that gets a ref on the file for later fput. * The caller must ensure that filp_close() called on the file. */ struct file *close_fd_get_file(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = pick_file(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; if (unlikely(fd >= fdt->max_fds)) return NULL; fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); file = rcu_dereference_raw(*fdentry); if (unlikely(!file)) return NULL; if (unlikely(file->f_mode & mask)) return NULL; /* * Ok, we have a file pointer. However, because we do * this all locklessly under RCU, we may be racing with * that file being closed. * * Such a race can take two forms: * * (a) the file ref already went down to zero, * and get_file_rcu() fails. Just try again: */ if (unlikely(!get_file_rcu(file))) continue; /* * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || unlikely(rcu_dereference_raw(*fdentry) != file)) { fput(file); continue; } /* * Ok, we have a ref to the file, and checked that it * still exists. */ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; struct file *file = NULL; task_lock(task); files = task->files; if (files) file = files_lookup_fd_rcu(files, fd); task_unlock(task); return file; } struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { for (; fd < files_fdtable(files)->max_fds; fd++) { file = files_lookup_fd_rcu(files, fd); if (file) break; } } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(task_lookup_next_fd_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. */ static unsigned long __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (atomic_read_acquire(&files->count) == 1) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return 0; return (unsigned long)file; } else { file = __fget(fd, mask); if (!file) return 0; return FDPUT_FPUT | (unsigned long)file; } } unsigned long __fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(__fdget); unsigned long __fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". */ static inline bool file_needs_f_pos_lock(struct file *file) { return (file->f_mode & FMODE_ATOMIC_POS) && (file_count(file) > 1 || file->f_op->iterate_shared); } unsigned long __fdget_pos(unsigned int fd) { unsigned long v = __fdget(fd); struct file *file = (struct file *)(v & ~3); if (file && file_needs_f_pos_lock(file)) { v |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return v; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); if (flag) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { struct files_struct *files = current->files; struct fdtable *fdt; bool res; rcu_read_lock(); fdt = files_fdtable(files); res = close_on_exec(fd, fdt); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * We need to detect attempts to do dup2() over allocated but still * not finished descriptor. NB: OpenBSD avoids that at the price of * extra work in their equivalent of fget() - they insert struct * file immediately after grabbing descriptor, mark it larval if * more work (e.g. actual opening) is needed and make sure that * fget() treats larval files as absent. Potentially interesting, * but while extra work in fget() is trivial, locking implications * and amount of surgery on open()-related paths in VFS are not. * FreeBSD fails with -EBADF in the same situation, NetBSD "solution" * deadlocks in rather amusing ways, AFAICS. All of that is out of * scope of POSIX or SUS, since neither considers shared descriptor * tables and this condition does not arise without those. */ fdt = files_fdtable(files); tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt); if (flags & O_CLOEXEC) __set_close_on_exec(fd, fdt); else __clear_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * __receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. */ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } int receive_fd(struct file *file, unsigned int o_flags) { return __receive_fd(file, NULL, o_flags); } EXPORT_SYMBOL_GPL(receive_fd); static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; int retval = oldfd; rcu_read_lock(); if (!files_lookup_fd_rcu(files, oldfd)) retval = -EBADF; rcu_read_unlock(); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd);
11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0-or-later /* Socket buffer accounting * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" #define select_skb_count(skb) (&rxrpc_n_rx_skbs) /* * Note the allocation or reception of a socket buffer. */ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_read(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } } /* * Note the addition of a ref on a socket buffer. */ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); skb_get(skb); } /* * Note the dropping of a ref on a socket buffer by the core. */ void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { int n = atomic_inc_return(&rxrpc_n_rx_skbs); trace_rxrpc_skb(skb, 0, n, why); } /* * Note the destruction of a socket buffer. */ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { if (skb) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); consume_skb(skb); } } /* * Clear a queue of socket buffers. */ void rxrpc_purge_queue(struct sk_buff_head *list) { struct sk_buff *skb; while ((skb = skb_dequeue((list))) != NULL) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, refcount_read(&skb->users), n, rxrpc_skb_put_purge); consume_skb(skb); } }
7076 275 7239 1 53 784 10 10 1 3778 447 2689 407 220 54 774 10 35 117 2672 4501 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Socket Filter Data Structures */ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ #include <linux/atomic.h> #include <linux/bpf.h> #include <linux/refcount.h> #include <linux/compat.h> #include <linux/skbuff.h> #include <linux/linkage.h> #include <linux/printk.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/capability.h> #include <linux/set_memory.h> #include <linux/kallsyms.h> #include <linux/if_vlan.h> #include <linux/vmalloc.h> #include <linux/sockptr.h> #include <crypto/sha1.h> #include <linux/u64_stats_sync.h> #include <net/sch_generic.h> #include <asm/byteorder.h> #include <uapi/linux/filter.h> struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; struct ctl_table; struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. */ #define BPF_REG_ARG1 BPF_REG_1 #define BPF_REG_ARG2 BPF_REG_2 #define BPF_REG_ARG3 BPF_REG_3 #define BPF_REG_ARG4 BPF_REG_4 #define BPF_REG_ARG5 BPF_REG_5 #define BPF_REG_CTX BPF_REG_6 #define BPF_REG_FP BPF_REG_10 /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_2 /* scratch reg */ #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 /* unused opcode to mark special load instruction. Same as BPF_ABS */ #define BPF_PROBE_MEM 0x20 /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 /* unused opcode to mark speculation barrier for mitigating * Speculative Store Bypass */ #define BPF_NOSPEC 0xc0 /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. */ #define BPF_SYM_ELF_TYPE 't' /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ #define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU64_REG(OP, DST, SRC) \ BPF_ALU64_REG_OFF(OP, DST, SRC, 0) #define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU32_REG(OP, DST, SRC) \ BPF_ALU32_REG_OFF(OP, DST, SRC, 0) /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ #define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU64_IMM(OP, DST, IMM) \ BPF_ALU64_IMM_OFF(OP, DST, IMM, 0) #define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU32_IMM(OP, DST, IMM) \ BPF_ALU32_IMM_OFF(OP, DST, IMM, 0) /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ #define BPF_ENDIAN(TYPE, DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Byte Swap, bswap16/32/64 */ #define BPF_BSWAP(DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Short form of mov, dst_reg = src_reg */ #define BPF_MOV64_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_MOV32_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ #define BPF_MOVSX64_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_MOVSX32_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Special form of mov32, used for doing explicit zero extension on dst. */ #define BPF_ZEXT_REG(DST) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = DST, \ .off = 0, \ .imm = 1 }) static inline bool insn_is_zext(const struct bpf_insn *insn) { return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; } /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_DW | BPF_IMM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = (__u32) (IMM) }), \ ((struct bpf_insn) { \ .code = 0, /* zero is reserved opcode */ \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ #define BPF_LD_ABS(SIZE, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */ #define BPF_LD_IND(SIZE, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ .dst_reg = 0, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory load, dst_reg = *(signed size *) (src_reg + off16) */ #define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_AND *(uint *) (dst_reg + off16) &= src_reg * BPF_OR *(uint *) (dst_reg + off16) |= src_reg * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) /* Legacy alias */ #define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ #define BPF_JMP_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = OFF, \ .imm = 0 }) /* Relative call */ #define BPF_CALL_REL(TGT) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_CALL, \ .off = 0, \ .imm = TGT }) /* Convert function address to BPF immediate */ #define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = BPF_CALL_IMM(FUNC) }) /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct bpf_insn) { \ .code = CODE, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) /* Program exit */ #define BPF_EXIT_INSN() \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_EXIT, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Speculation barrier */ #define BPF_ST_NOSPEC() \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_NOSPEC, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ ((struct sock_filter) BPF_STMT(CODE, K)) #define __BPF_JUMP(CODE, K, JT, JF) \ ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF)) #define bytes_to_bpf_size(bytes) \ ({ \ int bpf_size = -EINVAL; \ \ if (bytes == sizeof(u8)) \ bpf_size = BPF_B; \ else if (bytes == sizeof(u16)) \ bpf_size = BPF_H; \ else if (bytes == sizeof(u32)) \ bpf_size = BPF_W; \ else if (bytes == sizeof(u64)) \ bpf_size = BPF_DW; \ \ bpf_size; \ }) #define bpf_size_to_bytes(bpf_size) \ ({ \ int bytes = -EINVAL; \ \ if (bpf_size == BPF_B) \ bytes = sizeof(u8); \ else if (bpf_size == BPF_H) \ bytes = sizeof(u16); \ else if (bpf_size == BPF_W) \ bytes = sizeof(u32); \ else if (bpf_size == BPF_DW) \ bytes = sizeof(u64); \ \ bytes; \ }) #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_FIELD_SIZEOF(type, field) \ ({ \ const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_LDST_BYTES(insn) \ ({ \ const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ WARN_ON(__size < 0); \ __size; \ }) #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) #define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) #define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) #define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) #define __BPF_REG_0(...) __BPF_PAD(5) #define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) #define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) #define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) #define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) #define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) #define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) #define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) #define __BPF_CAST(t, a) \ (__force t) \ (__force \ typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ (unsigned long)0, (t)0))) a #define __BPF_V void #define __BPF_N #define __BPF_DECL_ARGS(t, a) t a #define __BPF_DECL_REGS(t, a) u64 a #define __BPF_PAD(n) \ __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) #define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) #define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) #define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 #if BITS_PER_LONG == 64 # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #else # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 #endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE)); \ *(PTR_SIZE) = (SIZE); \ offsetof(TYPE, MEMBER); \ }) /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { u16 len; compat_uptr_t filter; /* struct sock_filter * */ }; struct sock_fprog_kern { u16 len; struct sock_filter *filter; }; /* Some arches need doubleword alignment for their instructions and/or data */ #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog_stats { u64_stats_t cnt; u64_stats_t nsecs; u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); struct sk_filter { refcount_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)); static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, const void *ctx, bpf_dispatcher_fn dfunc) { u32 ret; cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } return ret; } static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) { return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); } /* * Use in preemptible and therefore migratable context to make sure that * the execution of the BPF program runs on one CPU. * * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) { u32 ret; migrate_disable(); ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; void *data_meta; void *data_end; }; struct bpf_nh_params { u32 nh_family; union { u32 ipv4_nh; struct in6_addr ipv6_nh; }; }; struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; u32 kern_flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); /* flags for bpf_redirect_info kern_flags */ #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) * ensure that cb[] area can be written to when BPF program is * invoked (otherwise cb[] save/restore is necessary). */ static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); cb->data_meta = skb->data - skb_metadata_len(skb); cb->data_end = skb->data + skb_headlen(skb); } /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ static inline void bpf_compute_and_save_data_end( struct sk_buff *skb, void **saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; *saved_data_end = cb->data_end; cb->data_end = skb->data + skb_headlen(skb); } /* Restore data saved by bpf_compute_data_pointers(). */ static inline void bpf_restore_data_end( struct sk_buff *skb, void *saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; cb->data_end = saved_data_end; } static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with * tc, that scratch memory is mapped to qdisc_skb_cb's data area. * * In some socket filter cases, the cb unfortunately needs to be * saved/restored so that protocol specific skb->cb[] data won't * be lost. In any case, due to unpriviledged eBPF programs * attached to sockets, we need to clear the bpf_skb_cb() area * to not leak previous contents to user space. */ BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN); BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != sizeof_field(struct qdisc_skb_cb, data)); return qdisc_skb_cb(skb)->data; } /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, const void *ctx) { const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; if (unlikely(prog->cb_access)) { memcpy(cb_saved, cb_data, sizeof(cb_saved)); memset(cb_data, 0, sizeof(cb_saved)); } res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u32 res; migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); migrate_enable(); return res; } static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); u32 res; if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } DECLARE_BPF_DISPATCHER(xdp) DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp); void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); } static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) { return round_up(bpf_prog_insn_size(prog) + sizeof(__be64) + 1, SHA1_BLOCK_SIZE); } static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), offsetof(struct bpf_prog, insns[proglen])); } static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) { /* When classic BPF programs have been loaded and the arch * does not have a classic BPF JIT (anymore), they have been * converted via bpf_migrate_filter() to eBPF and thus always * have an unspec program type. */ return prog->type == BPF_PROG_TYPE_UNSPEC; } static inline u32 bpf_ctx_off_adjust_machine(u32 size) { const u32 size_machine = sizeof(unsigned long); if (size > size_machine && size % size_machine == 0) size = size_machine; return size; } static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { return size <= size_default && (size & (size - 1)) == 0; } static inline u8 bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) { u8 access_off = off & (size_default - 1); #ifdef __LITTLE_ENDIAN return access_off; #else return size_default - (access_off + size); #endif } #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ off % sizeof(__u64) == 0) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); set_memory_ro((unsigned long)fp, fp->pages); } #endif } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { return sk_filter_trim_cap(sk, skb, 1); } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); void bpf_prog_jit_attempt_done(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { __bpf_prog_free(fp); } typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); void bpf_clear_redirect_map(struct bpf_map *map); static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { unsigned int len; if (unlikely(!(fwd->flags & IFF_UP))) return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; if (pktlen > len) return -EMSGSIZE; return 0; } /* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, struct bpf_prog *prog); void xdp_do_flush(void); /* The xdp_do_flush_map() helper has been renamed to drop the _map suffix, as * it is no longer only flushing maps. Keep this define for compatibility * until all drivers are updated - do not use xdp_do_flush_map() in new code! */ #define xdp_do_flush_map xdp_do_flush void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { return NULL; } #endif #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); u64 bpf_jit_alloc_exec_limit(void); void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(struct bpf_binary_header *hdr); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || fp->aux->ksym.lnode.prev == LIST_POISON2; } struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, unsigned int alignment, struct bpf_binary_header **rw_hdr, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns); int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } static inline bool bpf_jit_is_ebpf(void) { # ifdef CONFIG_HAVE_EBPF_JIT return true; # else return false; # endif } static inline bool ebpf_jit_enabled(void) { return bpf_jit_enable && bpf_jit_is_ebpf(); } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return fp->jited && bpf_jit_is_ebpf(); } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { /* These are the prerequisites, should someone ever have the * idea to call blinding outside of them, we make sure to * bail out. */ if (!bpf_jit_is_ebpf()) return false; if (!prog->jit_requested) return false; if (!bpf_jit_harden) return false; if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; } static inline bool bpf_jit_kallsyms_enabled(void) { /* There are a couple of corner cases where kallsyms should * not be enabled f.e. on hardening. */ if (bpf_jit_harden) return false; if (!bpf_jit_kallsyms) return false; if (bpf_jit_kallsyms == 1) return true; return false; } const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { const char *ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; return ret; } void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); #else /* CONFIG_BPF_JIT */ static inline bool ebpf_jit_enabled(void) { return false; } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { return false; } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; } static inline int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { return -ENOTSUPP; } static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } static inline bool bpf_jit_kallsyms_enabled(void) { return false; } static inline const char * __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { return NULL; } static inline bool is_bpf_text_address(unsigned long addr) { return false; } static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { return -ERANGE; } static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { return NULL; } static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { return NULL; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) { switch (first->code) { case BPF_RET | BPF_K: case BPF_LD | BPF_W | BPF_LEN: return false; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: if (first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X) return true; return false; default: return true; } } static inline u16 bpf_anc_helper(const struct sock_filter *ftest) { BUG_ON(ftest->code & BPF_ANC); switch (ftest->code) { case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: #define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ return BPF_ANC | SKF_AD_##CODE switch (ftest->k) { BPF_ANCILLARY(PROTOCOL); BPF_ANCILLARY(PKTTYPE); BPF_ANCILLARY(IFINDEX); BPF_ANCILLARY(NLATTR); BPF_ANCILLARY(NLATTR_NEST); BPF_ANCILLARY(MARK); BPF_ANCILLARY(QUEUE); BPF_ANCILLARY(HATYPE); BPF_ANCILLARY(RXHASH); BPF_ANCILLARY(CPU); BPF_ANCILLARY(ALU_XOR_X); BPF_ANCILLARY(VLAN_TAG); BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); BPF_ANCILLARY(VLAN_TPID); } fallthrough; default: return ftest->code; } } void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size); static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } struct bpf_sock_addr_kern { struct sock *sk; struct sockaddr *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; void *t_ctx; /* Attach type specific context. */ }; struct bpf_sock_ops_kern { struct sock *sk; union { u32 args[4]; u32 reply; u32 replylong[4]; }; struct sk_buff *syn_skb; struct sk_buff *skb; void *skb_data_end; u8 op; u8 is_fullsock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that * should be initialized to 0 should * be inserted before temp. * temp is scratch storage used by * sock_ops_convert_ctx_access * as temporary storage of a register. */ }; struct bpf_sysctl_kern { struct ctl_table_header *head; struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; size_t new_len; int new_updated; int write; loff_t *ppos; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; #define BPF_SOCKOPT_KERN_BUF_SIZE 32 struct bpf_sockopt_buf { u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; }; struct bpf_sockopt_kern { struct sock *sk; u8 *optval; u8 *optval_end; s32 level; s32 optname; s32 optlen; /* for retval in struct bpf_cg_run_ctx */ struct task_struct *current_task; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; __be16 sport; u16 dport; struct { __be32 saddr; __be32 daddr; } v4; struct { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; struct sock *selected_sk; u32 ingress_ifindex; bool no_reuseport; }; extern struct static_key_false bpf_sk_lookup_enabled; /* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup. * * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and * SK_DROP. Their meaning is as follows: * * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup * SK_DROP : terminate lookup with -ECONNREFUSED * * This macro aggregates return values and selected sockets from * multiple BPF programs according to following rules in order: * * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk, * macro result is SK_PASS and last ctx.selected_sk is used. * 2. If any program returned SK_DROP return value, * macro result is SK_DROP. * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL. * * Caller must ensure that the prog array is non-NULL, and that the * array as well as the programs it contains remain valid. */ #define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \ ({ \ struct bpf_sk_lookup_kern *_ctx = &(ctx); \ struct bpf_prog_array_item *_item; \ struct sock *_selected_sk = NULL; \ bool _no_reuseport = false; \ struct bpf_prog *_prog; \ bool _all_pass = true; \ u32 _ret; \ \ migrate_disable(); \ _item = &(array)->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ /* restore most recent selection */ \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ \ _ret = func(_prog, _ctx); \ if (_ret == SK_PASS && _ctx->selected_sk) { \ /* remember last non-NULL socket */ \ _selected_sk = _ctx->selected_sk; \ _no_reuseport = _ctx->no_reuseport; \ } else if (_ret == SK_DROP && _all_pass) { \ _all_pass = false; \ } \ _item++; \ } \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ migrate_enable(); \ _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ }) static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET, .protocol = protocol, .v4.saddr = saddr, .v4.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #if IS_ENABLED(CONFIG_IPV6) static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET6, .protocol = protocol, .v6.saddr = saddr, .v6.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #endif /* IS_ENABLED(CONFIG_IPV6) */ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes * precedence. */ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags & action_mask; } ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; if (flags & BPF_F_BROADCAST) { WRITE_ONCE(ri->map, map); ri->flags = flags; } else { WRITE_ONCE(ri->map, NULL); ri->flags = 0; } return XDP_REDIRECT; } #ifdef CONFIG_NET int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return -EOPNOTSUPP; } static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { return NULL; } static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { } #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */
8648 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 /* SPDX-License-Identifier: GPL-2.0 */ /* rwsem.h: R/W semaphores, public interface * * Written by David Howells (dhowells@redhat.com). * Derived from asm-i386/semaphore.h */ #ifndef _LINUX_RWSEM_H #define _LINUX_RWSEM_H #include <linux/linkage.h> #include <linux/types.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/err.h> #include <linux/cleanup.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC # define __RWSEM_DEP_MAP_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_SLEEP, \ }, #else # define __RWSEM_DEP_MAP_INIT(lockname) #endif #ifndef CONFIG_PREEMPT_RT #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif /* * For an uncontended rwsem, count and owner are the only fields a task * needs to touch when acquiring the rwsem. So they are put next to each * other to increase the chance that they will share the same cacheline. * * In a contended rwsem, the owner is likely the most frequently accessed * field in the structure as the optimistic waiter that holds the osq lock * will spin on owner. For an embedded rwsem, other hot fields in the * containing structure should be moved further away from the rwsem to * reduce the chance that they will share the same cacheline causing * cacheline bouncing problem. */ struct rw_semaphore { atomic_long_t count; /* * Write owner or one of the read owners as well flags regarding * the current state of the rwsem. Can be used as a speculative * check to see if the write owner is running on the cpu. */ atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_RWSEMS void *magic; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) { return atomic_long_read(&sem->count) != 0; } #define RWSEM_UNLOCKED_VALUE 0L #define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) /* Common initializer macros and functions */ #ifdef CONFIG_DEBUG_RWSEMS # define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname, #else # define __RWSEM_DEBUG_INIT(lockname) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED, #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_COUNT_INIT(name), \ .owner = ATOMIC_LONG_INIT(0), \ __RWSEM_OPT_INIT(name) \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\ .wait_list = LIST_HEAD_INIT((name).wait_list), \ __RWSEM_DEBUG_INIT(name) \ __RWSEM_DEP_MAP_INIT(name) } #define DECLARE_RWSEM(name) \ struct rw_semaphore name = __RWSEM_INITIALIZER(name) extern void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) /* * This is the same regardless of which rwsem implementation that is being used. * It is just a heuristic meant to be called by somebody already holding the * rwsem to see if somebody from an incompatible type is wanting access to the * lock. */ static inline int rwsem_is_contended(struct rw_semaphore *sem) { return !list_empty(&sem->wait_list); } #else /* !CONFIG_PREEMPT_RT */ #include <linux/rwbase_rt.h> struct rw_semaphore { struct rwbase_rt rwbase; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #define __RWSEM_INITIALIZER(name) \ { \ .rwbase = __RWBASE_INITIALIZER(name), \ __RWSEM_DEP_MAP_INIT(name) \ } #define DECLARE_RWSEM(lockname) \ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) extern void __init_rwsem(struct rw_semaphore *rwsem, const char *name, struct lock_class_key *key); #define init_rwsem(sem) \ do { \ static struct lock_class_key __key; \ \ __init_rwsem((sem), #sem, &__key); \ } while (0) static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); } #endif /* CONFIG_PREEMPT_RT */ /* * The functions below are the same for all rwsem implementations including * the RT specific variant. */ /* * lock for reading */ extern void down_read(struct rw_semaphore *sem); extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* * trylock for reading -- returns 1 if successful, 0 if contention */ extern int down_read_trylock(struct rw_semaphore *sem); /* * lock for writing */ extern void down_write(struct rw_semaphore *sem); extern int __must_check down_write_killable(struct rw_semaphore *sem); /* * trylock for writing -- returns 1 if successful, 0 if contention */ extern int down_write_trylock(struct rw_semaphore *sem); /* * release a read lock */ extern void up_read(struct rw_semaphore *sem); /* * release a write lock */ extern void up_write(struct rw_semaphore *sem); DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T)) DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T)) DEFINE_FREE(up_read, struct rw_semaphore *, if (_T) up_read(_T)) DEFINE_FREE(up_write, struct rw_semaphore *, if (_T) up_write(_T)) /* * downgrade write lock to read lock */ extern void downgrade_write(struct rw_semaphore *sem); #ifdef CONFIG_DEBUG_LOCK_ALLOC /* * nested locking. NOTE: rwsems are not allowed to recurse * (which occurs if the same task tries to acquire the same * lock instance multiple times), but multiple locks of the * same lock class might be taken, if the order of the locks * is always the same. This ordering rule can be expressed * to lockdep via the _nested() APIs, but enumerating the * subclasses that are used. (If the nesting relationship is * static then another method for expressing nested locking is * the explicit definition of lock class keys and the use of * lockdep_set_class() at lock initialization time. * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); # define down_write_nest_lock(sem, nest_lock) \ do { \ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0) /* * Take/release a lock when not the owner will release it. * * [ This API should be avoided as much as possible - the * proper abstraction for this case is completions. ] */ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) # define down_read_non_owner(sem) down_read(sem) # define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */
26 26 26 26 26 26 26 26 68 68 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 25 25 25 5 34 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP * * Copyright (c) 2020, Red Hat, Inc. */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/inet.h> #include <linux/kernel.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/netns/generic.h> #include <net/mptcp.h> #include <net/genetlink.h> #include <uapi/linux/mptcp.h> #include "protocol.h" #include "mib.h" /* forward declaration */ static struct genl_family mptcp_genl_family; static int pm_nl_pernet_id; struct mptcp_pm_add_entry { struct list_head list; struct mptcp_addr_info addr; u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; }; struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; struct list_head local_addr_list; unsigned int addrs; unsigned int stale_loss_cnt; unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; unsigned int subflows_max; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; #define MPTCP_PM_ADDR_MAX 8 #define ADD_ADDR_RETRANS_MAX 3 static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) { return net_generic(net, pm_nl_pernet_id); } static struct pm_nl_pernet * pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) { return pm_nl_get_pernet(sock_net((struct sock *)msk)); } bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port) { bool addr_equals = false; if (a->family == b->family) { if (a->family == AF_INET) addr_equals = a->addr.s_addr == b->addr.s_addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); } else if (a->family == AF_INET) { if (ipv6_addr_v4mapped(&b->addr6)) addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; } else if (b->family == AF_INET) { if (ipv6_addr_v4mapped(&a->addr6)) addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; #endif } if (!addr_equals) return false; if (!use_port) return true; return a->port == b->port; } void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = htons(skc->skc_num); if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_rcv_saddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_rcv_saddr; #endif } static void remote_address(const struct sock_common *skc, struct mptcp_addr_info *addr) { addr->family = skc->skc_family; addr->port = skc->skc_dport; if (addr->family == AF_INET) addr->addr.s_addr = skc->skc_daddr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6) addr->addr6 = skc->skc_v6_daddr; #endif } static bool lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; struct sock_common *skc; list_for_each_entry(subflow, list, node) { skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); mptcp_local_address(skc, &cur); if (mptcp_addresses_equal(&cur, saddr, saddr->port)) return true; } return false; } static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { struct mptcp_subflow_context *subflow; struct mptcp_addr_info cur; struct sock_common *skc; list_for_each_entry(subflow, list, node) { skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); remote_address(skc, &cur); if (mptcp_addresses_equal(&cur, daddr, daddr->port)) return true; } return false; } static struct mptcp_pm_addr_entry * select_local_address(const struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *ret = NULL; msk_owned_by_me(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; ret = entry; break; } rcu_read_unlock(); return ret; } static struct mptcp_pm_addr_entry * select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *ret = NULL; rcu_read_lock(); /* do not keep any additional per socket state, just signal * the address list in order. * Note: removal from the local address list during the msk life-cycle * can lead to additional addresses not being announced. */ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) continue; ret = entry; break; } rcu_read_unlock(); return ret; } unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->add_addr_signal_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->add_addr_accept_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->subflows_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); return READ_ONCE(pernet->local_addr_max); } EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); return false; } return true; } struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; lockdep_assert_held(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, addr, true)) return entry; } return NULL; } bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) { struct mptcp_pm_add_entry *entry; struct mptcp_addr_info saddr; bool ret = false; mptcp_local_address((struct sock_common *)sk, &saddr); spin_lock_bh(&msk->pm.lock); list_for_each_entry(entry, &msk->pm.anno_list, list) { if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { ret = true; goto out; } } out: spin_unlock_bh(&msk->pm.lock); return ret; } static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; pr_debug("msk=%p", msk); if (!msk) return; if (inet_sk_state_load(sk) == TCP_CLOSE) return; if (!entry->addr.id) return; if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; } spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); entry->retrans_times++; } if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); spin_unlock_bh(&msk->pm.lock); if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) mptcp_pm_subflow_established(msk); out: __sock_put(sk); } struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id) { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry && (!check_id || entry->addr.id == addr->id)) entry->retrans_times = ADD_ADDR_RETRANS_MAX; spin_unlock_bh(&msk->pm.lock); if (entry && (!check_id || entry->addr.id == addr->id)) sk_stop_timer_sync(sk, &entry->add_timer); return entry; } bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); lockdep_assert_held(&msk->pm.lock); add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (add_entry) { if (mptcp_pm_is_kernel(msk)) return false; sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); return true; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); if (!add_entry) return false; list_add(&add_entry->list, &msk->pm.anno_list); add_entry->addr = *addr; add_entry->sock = msk; add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); sk_reset_timer(sk, &add_entry->add_timer, jiffies + mptcp_get_add_addr_timeout(net)); return true; } void mptcp_pm_free_anno_list(struct mptcp_sock *msk) { struct mptcp_pm_add_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); pr_debug("msk=%p", msk); spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.anno_list, &free_list); spin_unlock_bh(&msk->pm.lock); list_for_each_entry_safe(entry, tmp, &free_list, list) { sk_stop_timer_sync(sk, &entry->add_timer); kfree(entry); } } static bool lookup_address_in_vec(const struct mptcp_addr_info *addrs, unsigned int nr, const struct mptcp_addr_info *addr) { int i; for (i = 0; i < nr; i++) { if (addrs[i].id == addr->id) return true; } return false; } /* Fill all the remote addresses into the array addrs[], * and return the array size. */ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local, bool fullmesh, struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; struct mptcp_addr_info remote = { 0 }; unsigned int subflows_max; int i = 0; subflows_max = mptcp_pm_get_subflows_max(msk); remote_address((struct sock_common *)sk, &remote); /* Non-fullmesh endpoint, fill in the single entry * corresponding to the primary MPC subflow remote address */ if (!fullmesh) { if (deny_id0) return 0; if (!mptcp_pm_addr_families_match(sk, local, &remote)) return 0; msk->pm.subflows++; addrs[i++] = remote; } else { mptcp_for_each_subflow(msk, subflow) { ssk = mptcp_subflow_tcp_sock(subflow); remote_address((struct sock_common *)ssk, &addrs[i]); addrs[i].id = subflow->remote_id; if (deny_id0 && !addrs[i].id) continue; if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) continue; if (!lookup_address_in_vec(addrs, i, &addrs[i]) && msk->pm.subflows < subflows_max) { msk->pm.subflows++; i++; } } } return i; } static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); bool slow; pr_debug("send ack for %s", prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); slow = lock_sock_fast(ssk); if (prio) { subflow->send_mp_prio = 1; subflow->backup = backup; subflow->request_bkup = backup; } __mptcp_subflow_send_ack(ssk); unlock_sock_fast(ssk, slow); } static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, bool prio, bool backup) { spin_unlock_bh(&msk->pm.lock); __mptcp_pm_send_ack(msk, subflow, prio, backup); spin_lock_bh(&msk->pm.lock); } static struct mptcp_pm_addr_entry * __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, &pernet->local_addr_list, list) { if (entry->addr.id == id) return entry; } return NULL; } static struct mptcp_pm_addr_entry * __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, bool lookup_by_id) { struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, &pernet->local_addr_list, list) { if ((!lookup_by_id && mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) || (lookup_by_id && entry->addr.id == info->id)) return entry; } return NULL; } static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *local; unsigned int add_addr_signal_max; unsigned int local_addr_max; struct pm_nl_pernet *pernet; unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); struct mptcp_pm_addr_entry *entry; struct mptcp_addr_info mpc_addr; bool backup = false; mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); rcu_read_lock(); entry = __lookup_addr(pernet, &mpc_addr, false); if (entry) { __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); msk->mpc_endpoint_id = entry->addr.id; backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); } rcu_read_unlock(); if (backup) mptcp_pm_send_ack(msk, subflow, true, backup); msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); } pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, msk->pm.subflows, subflows_max); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { local = select_signal_address(pernet, msk); /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the * corresponding id will be marked as used. * Instead let the PM machinery reschedule us when the * current address announce will be completed. */ if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) return; if (local) { if (mptcp_pm_alloc_anno_list(msk, &local->addr)) { __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &local->addr, false); mptcp_pm_nl_addr_send_ack(msk); } } } /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && msk->pm.subflows < subflows_max) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; local = select_local_address(pernet, msk); if (!local) break; fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); msk->pm.local_addr_used++; __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); nr = fill_remote_addresses_vec(msk, &local->addr, fullmesh, addrs); if (nr == 0) continue; spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); spin_lock_bh(&msk->pm.lock); } mptcp_pm_nl_check_work_pending(msk); } static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) { mptcp_pm_create_subflow_or_signal_addr(msk); } /* Fill all the local addresses into the array addrs[], * and return the array size. */ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_addr_info *addrs) { struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; unsigned int subflows_max; int i = 0; pernet = pm_nl_get_pernet_from_msk(msk); subflows_max = mptcp_pm_get_subflows_max(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) continue; if (msk->pm.subflows < subflows_max) { msk->pm.subflows++; addrs[i++] = entry->addr; } } rcu_read_unlock(); /* If the array is empty, fill in the single * 'IPADDRANY' local address */ if (!i) { struct mptcp_addr_info local; memset(&local, 0, sizeof(local)); local.family = #if IS_ENABLED(CONFIG_MPTCP_IPV6) remote->family == AF_INET6 && ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : #endif remote->family; if (!mptcp_pm_addr_families_match(sk, &local, remote)) return 0; msk->pm.subflows++; addrs[i++] = local; } return i; } static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; unsigned int subflows_max; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); pr_debug("accepted %d:%d remote family %d", msk->pm.add_addr_accepted, add_addr_accept_max, msk->pm.remote.family); remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_nl_addr_send_ack(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; /* pick id 0 port, if none is provided the remote address */ if (!remote.port) remote.port = sk->sk_dport; /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ nr = fill_local_addresses_vec(msk, &remote, addrs); if (nr == 0) return; msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || msk->pm.subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); spin_unlock_bh(&msk->pm.lock); for (i = 0; i < nr; i++) __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); } void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; msk_owned_by_me(msk); lockdep_assert_held(&msk->pm.lock); if (!mptcp_pm_should_add_signal(msk) && !mptcp_pm_should_rm_signal(msk)) return; subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); if (subflow) mptcp_pm_send_ack(msk, subflow, false, false); } int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct mptcp_addr_info *addr, struct mptcp_addr_info *rem, u8 bkup) { struct mptcp_subflow_context *subflow; pr_debug("bkup=%d", bkup); mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct mptcp_addr_info local, remote; mptcp_local_address((struct sock_common *)ssk, &local); if (!mptcp_addresses_equal(&local, addr, addr->port)) continue; if (rem && rem->family != AF_UNSPEC) { remote_address((struct sock_common *)ssk, &remote); if (!mptcp_addresses_equal(&remote, rem, rem->port)) continue; } __mptcp_pm_send_ack(msk, subflow, true, bkup); return 0; } return -EINVAL; } static bool mptcp_local_id_match(const struct mptcp_sock *msk, u8 local_id, u8 id) { return local_id == id || (!local_id && msk->mpc_endpoint_id == id); } static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list, enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; u8 i; pr_debug("%s rm_list_nr %d", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); msk_owned_by_me(msk); if (sk->sk_state == TCP_LISTEN) return; if (!rm_list->nr) return; if (list_empty(&msk->conn_list)) return; for (i = 0; i < rm_list->nr; i++) { u8 rm_id = rm_list->ids[i]; bool removed = false; mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow->local_id; if (rm_type == MPTCP_MIB_RMADDR && subflow->remote_id != rm_id) continue; if (rm_type == MPTCP_MIB_RMSUBFLOW && !mptcp_local_id_match(msk, id, rm_id)) continue; pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u", rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", i, rm_id, subflow->local_id, subflow->remote_id, msk->mpc_endpoint_id); spin_unlock_bh(&msk->pm.lock); mptcp_subflow_shutdown(sk, ssk, how); /* the following takes care of updating the subflows counter */ mptcp_close_ssk(sk, ssk, subflow); spin_lock_bh(&msk->pm.lock); removed = true; __MPTCP_INC_STATS(sock_net(sk), rm_type); } if (rm_type == MPTCP_MIB_RMSUBFLOW) __set_bit(rm_id ? rm_id : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap); if (!removed) continue; if (!mptcp_pm_is_kernel(msk)) continue; if (rm_type == MPTCP_MIB_RMADDR) { msk->pm.add_addr_accepted--; WRITE_ONCE(msk->pm.accept_addr, true); } else if (rm_type == MPTCP_MIB_RMSUBFLOW) { msk->pm.local_addr_used--; } } } static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) { mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } void mptcp_pm_nl_work(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; msk_owned_by_me(msk); if (!(pm->status & MPTCP_PM_WORK_MASK)) return; spin_lock_bh(&msk->pm.lock); pr_debug("msk=%p status=%x", msk, pm->status); if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); mptcp_pm_nl_add_addr_received(msk); } if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); mptcp_pm_nl_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); mptcp_pm_nl_rm_addr_received(msk); } if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); mptcp_pm_nl_fully_established(msk); } if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); mptcp_pm_nl_subflow_established(msk); } spin_unlock_bh(&msk->pm.lock); } static bool address_use_port(struct mptcp_pm_addr_entry *entry) { return (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == MPTCP_PM_ADDR_FLAG_SIGNAL; } /* caller must ensure the RCU grace period is already elapsed */ static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) { if (entry->lsk) sock_release(entry->lsk); kfree(entry); } static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, struct mptcp_pm_addr_entry *entry) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; unsigned int addr_max; int ret = -EINVAL; spin_lock_bh(&pernet->lock); /* to keep the code simple, don't do IDR-like allocation for address ID, * just bail when we exceed limits */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { ret = -ERANGE; goto out; } if (test_bit(entry->addr.id, pernet->id_bitmap)) { ret = -EBUSY; goto out; } /* do not insert duplicate address, differentiate on port only * singled addresses */ if (!address_use_port(entry)) entry->addr.port = 0; list_for_each_entry(cur, &pernet->local_addr_list, list) { if (mptcp_addresses_equal(&cur->addr, &entry->addr, cur->addr.port || entry->addr.port)) { /* allow replacing the exiting endpoint only if such * endpoint is an implicit one and the user-space * did not provide an endpoint id */ if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { ret = -EEXIST; goto out; } if (entry->addr.id) goto out; pernet->addrs--; entry->addr.id = cur->addr.id; list_del_rcu(&cur->list); del_entry = cur; break; } } if (!entry->addr.id) { find_next: entry->addr.id = find_next_zero_bit(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, pernet->next_id); if (!entry->addr.id && pernet->next_id != 1) { pernet->next_id = 1; goto find_next; } } if (!entry->addr.id) goto out; __set_bit(entry->addr.id, pernet->id_bitmap); if (entry->addr.id > pernet->next_id) pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max + 1); } pernet->addrs++; if (!entry->addr.port) list_add_tail_rcu(&entry->list, &pernet->local_addr_list); else list_add_rcu(&entry->list, &pernet->local_addr_list); ret = entry->addr.id; out: spin_unlock_bh(&pernet->lock); /* just replaced an existing entry, free it */ if (del_entry) { synchronize_rcu(); __mptcp_pm_release_addr_entry(del_entry); } return ret; } static struct lock_class_key mptcp_slock_keys[2]; static struct lock_class_key mptcp_keys[2]; static int mptcp_pm_nl_create_listen_socket(struct sock *sk, struct mptcp_pm_addr_entry *entry) { bool is_ipv6 = sk->sk_family == AF_INET6; int addrlen = sizeof(struct sockaddr_in); struct sockaddr_storage addr; struct sock *newsk, *ssk; int backlog = 1024; int err; err = sock_create_kern(sock_net(sk), entry->addr.family, SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); if (err) return err; newsk = entry->lsk->sk; if (!newsk) return -EINVAL; /* The subflow socket lock is acquired in a nested to the msk one * in several places, even by the TCP stack, and this msk is a kernel * socket: lockdep complains. Instead of propagating the _nested * modifiers in several places, re-init the lock class for the msk * socket to an mptcp specific one. */ sock_lock_init_class_and_name(newsk, is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", &mptcp_slock_keys[is_ipv6], is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", &mptcp_keys[is_ipv6]); lock_sock(newsk); ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); release_sock(newsk); if (IS_ERR(ssk)) return PTR_ERR(ssk); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (entry->addr.family == AF_INET6) addrlen = sizeof(struct sockaddr_in6); #endif if (ssk->sk_family == AF_INET) err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ssk->sk_family == AF_INET6) err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); #endif if (err) return err; inet_sk_state_store(newsk, TCP_LISTEN); lock_sock(ssk); err = __inet_listen_sk(ssk, backlog); if (!err) mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); release_sock(ssk); return err; } int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int ret = -1; pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { if (mptcp_addresses_equal(&entry->addr, skc, entry->addr.port)) { ret = entry->addr.id; break; } } rcu_read_unlock(); if (ret >= 0) return ret; /* address not found, add to local list */ entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->addr = *skc; entry->addr.id = 0; entry->addr.port = 0; entry->ifindex = 0; entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) kfree(entry); return ret; } #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 static const struct genl_multicast_group mptcp_pm_mcgrps[] = { [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, [MPTCP_PM_EV_GRP_OFFSET] = { .name = MPTCP_PM_EV_GRP_NAME, .flags = GENL_UNS_ADMIN_PERM, }, }; static const struct nla_policy mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = { [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, [MPTCP_PM_ADDR_ATTR_ADDR6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 }, [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 }, [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 }, }; static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { [MPTCP_PM_ATTR_ADDR] = NLA_POLICY_NESTED(mptcp_pm_addr_policy), [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_TOKEN] = { .type = NLA_U32, }, [MPTCP_PM_ATTR_LOC_ID] = { .type = NLA_U8, }, [MPTCP_PM_ATTR_ADDR_REMOTE] = NLA_POLICY_NESTED(mptcp_pm_addr_policy), }; void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; unsigned int active_max_loss_cnt; struct net *net = sock_net(sk); unsigned int stale_loss_cnt; bool slow; stale_loss_cnt = mptcp_stale_loss_cnt(net); if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) return; /* look for another available subflow not in loss state */ active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); mptcp_for_each_subflow(msk, iter) { if (iter != subflow && mptcp_subflow_active(iter) && iter->stale_count < active_max_loss_cnt) { /* we have some alternatives, try to mark this subflow as idle ...*/ slow = lock_sock_fast(ssk); if (!tcp_rtx_and_write_queues_empty(ssk)) { subflow->stale = 1; __mptcp_retransmit_pending_data(sk); MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); } unlock_sock_fast(ssk, slow); /* always try to push the pending data regardless of re-injections: * we can possibly use backup subflows now, and subflow selection * is cheap under the msk socket lock */ __mptcp_push_pending(sk, 0); return; } } } static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (family == AF_INET6) return MPTCP_PM_ADDR_ATTR_ADDR6; #endif return MPTCP_PM_ADDR_ATTR_ADDR4; } static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], const struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr, bool require_family) { int err, addr_addr; if (!attr) { GENL_SET_ERR_MSG(info, "missing address info"); return -EINVAL; } /* no validation needed - was already done via nested policy */ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, mptcp_pm_addr_policy, info->extack); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_ID]) addr->id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) return 0; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); return -EINVAL; } addr->family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); if (addr->family != AF_INET #if IS_ENABLED(CONFIG_MPTCP_IPV6) && addr->family != AF_INET6 #endif ) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "unknown address family"); return -EINVAL; } addr_addr = mptcp_pm_family_to_addr(addr->family); if (!tb[addr_addr]) { NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing address data"); return -EINVAL; } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr->family == AF_INET6) addr->addr6 = nla_get_in6_addr(tb[addr_addr]); else #endif addr->addr.s_addr = nla_get_in_addr(tb[addr_addr]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; memset(addr, 0, sizeof(*addr)); return mptcp_pm_parse_pm_addr_attr(tb, attr, info, addr, true); } int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, bool require_family, struct mptcp_pm_addr_entry *entry) { struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; int err; memset(entry, 0, sizeof(*entry)); err = mptcp_pm_parse_pm_addr_attr(tb, attr, info, &entry->addr, require_family); if (err) return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); if (tb[MPTCP_PM_ADDR_ATTR_PORT]) entry->addr.port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); return 0; } static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) { return pm_nl_get_pernet(genl_info_net(info)); } static int mptcp_nl_add_subflow_or_signal_addr(struct net *net) { struct mptcp_sock *msk; long s_slot = 0, s_num = 0; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; if (!READ_ONCE(msk->fully_established) || mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); spin_lock_bh(&msk->pm.lock); mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; int ret; ret = mptcp_pm_parse_entry(attr, info, true, &addr); if (ret < 0) return ret; if (addr.addr.port && !(addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { GENL_SET_ERR_MSG(info, "flags must have signal when using port"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); return -EINVAL; } entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); return -ENOMEM; } *entry = addr; if (entry->addr.port) { ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); if (ret) { GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); goto out_free; } } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) { GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; } mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk)); return 0; out_free: __mptcp_pm_release_addr_entry(entry); return ret; } int mptcp_pm_nl_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { struct mptcp_pm_addr_entry *entry; struct sock *sk = (struct sock *)msk; struct net *net = sock_net(sk); rcu_read_lock(); entry = __lookup_addr_by_id(pm_nl_get_pernet(net), id); if (entry) { *flags = entry->flags; *ifindex = entry->ifindex; } rcu_read_unlock(); return 0; } static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr) { struct mptcp_pm_add_entry *entry; entry = mptcp_pm_del_add_timer(msk, addr, false); if (entry) { list_del(&entry->list); kfree(entry); return true; } return false; } static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) { struct mptcp_rm_list list = { .nr = 0 }; bool ret; list.ids[list.nr++] = addr->id; ret = remove_anno_list_by_saddr(msk, addr); if (ret || force) { spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } return ret; } static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, const struct mptcp_pm_addr_entry *entry) { const struct mptcp_addr_info *addr = &entry->addr; struct mptcp_rm_list list = { .nr = 0 }; long s_slot = 0, s_num = 0; struct mptcp_sock *msk; pr_debug("remove_id=%d", addr->id); list.ids[list.nr++] = addr->id; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; bool remove_subflow; if (mptcp_pm_is_userspace(msk)) goto next; if (list_empty(&msk->conn_list)) { mptcp_pm_remove_anno_addr(msk, addr, false); goto next; } lock_sock(sk); remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); if (remove_subflow) mptcp_pm_remove_subflow(msk, &list); release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } static int mptcp_nl_remove_id_zero_address(struct net *net, struct mptcp_addr_info *addr) { struct mptcp_rm_list list = { .nr = 0 }; long s_slot = 0, s_num = 0; struct mptcp_sock *msk; list.ids[list.nr++] = 0; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; struct mptcp_addr_info msk_local; if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; mptcp_local_address((struct sock_common *)msk, &msk_local); if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) goto next; lock_sock(sk); spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); mptcp_pm_nl_rm_subflow_received(msk, &list); spin_unlock_bh(&msk->pm.lock); release_sock(sk); next: sock_put(sk); cond_resched(); } return 0; } static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; unsigned int addr_max; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; /* the zero id address is special: the first address used by the msk * always gets such an id, so different subflows can have different zero * id addresses. Additionally zero id is not accounted for in id_bitmap. * Let's use an 'mptcp_rm_list' instead of the common remove code. */ if (addr.addr.id == 0) return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); spin_lock_bh(&pernet->lock); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); spin_unlock_bh(&pernet->lock); return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { addr_max = pernet->add_addr_signal_max; WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; WRITE_ONCE(pernet->local_addr_max, addr_max - 1); } pernet->addrs--; list_del_rcu(&entry->list); __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); synchronize_rcu(); __mptcp_pm_release_addr_entry(entry); return ret; } void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { remove_anno_list_by_saddr(msk, &entry->addr); if (alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } } void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list) { struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, rm_list, list) { if (lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && slist.nr < MPTCP_RM_IDS_MAX) slist.ids[slist.nr++] = entry->addr.id; if (remove_anno_list_by_saddr(msk, &entry->addr) && alist.nr < MPTCP_RM_IDS_MAX) alist.ids[alist.nr++] = entry->addr.id; } if (alist.nr) { spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &alist); spin_unlock_bh(&msk->pm.lock); } if (slist.nr) mptcp_pm_remove_subflow(msk, &slist); } static void mptcp_nl_remove_addrs_list(struct net *net, struct list_head *rm_list) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; if (list_empty(rm_list)) return; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; if (!mptcp_pm_is_userspace(msk)) { lock_sock(sk); mptcp_pm_remove_addrs_and_subflows(msk, rm_list); release_sock(sk); } sock_put(sk); cond_resched(); } } /* caller must ensure the RCU grace period is already elapsed */ static void __flush_addrs(struct list_head *list) { while (!list_empty(list)) { struct mptcp_pm_addr_entry *cur; cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); list_del_rcu(&cur->list); __mptcp_pm_release_addr_entry(cur); } } static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->add_addr_signal_max, 0); WRITE_ONCE(pernet->add_addr_accept_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); list_splice_init(&pernet->local_addr_list, &free_list); __reset_counters(pernet); pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); synchronize_rcu(); __flush_addrs(&free_list); return 0; } static int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); if (!attr) return -EMSGSIZE; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_PORT, ntohs(addr->port))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) goto nla_put_failure; if (entry->ifindex && nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) goto nla_put_failure; if (addr->family == AF_INET && nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, addr->addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (addr->family == AF_INET6 && nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6)) goto nla_put_failure; #endif nla_nest_end(skb, attr); return 0; nla_put_failure: nla_nest_cancel(skb, attr); return -EMSGSIZE; } static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; struct sk_buff *msg; void *reply; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, info->genlhdr->cmd); if (!reply) { GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); ret = -EMSGSIZE; goto fail; } spin_lock_bh(&pernet->lock); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { GENL_SET_ERR_MSG(info, "address not found"); ret = -EINVAL; goto unlock_fail; } ret = mptcp_nl_fill_addr(msg, entry); if (ret) goto unlock_fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); spin_unlock_bh(&pernet->lock); return ret; unlock_fail: spin_unlock_bh(&pernet->lock); fail: nlmsg_free(msg); return ret; } static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int id = cb->args[0]; void *hdr; int i; pernet = pm_nl_get_pernet(net); spin_lock_bh(&pernet->lock); for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { if (test_bit(i, pernet->id_bitmap)) { entry = __lookup_addr_by_id(pernet, i); if (!entry) break; if (entry->addr.id <= id) continue; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &mptcp_genl_family, NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); if (!hdr) break; if (mptcp_nl_fill_addr(msg, entry) < 0) { genlmsg_cancel(msg, hdr); break; } id = entry->addr.id; genlmsg_end(msg, hdr); } } spin_unlock_bh(&pernet->lock); cb->args[0] = id; return msg->len; } static int parse_limit(struct genl_info *info, int id, unsigned int *limit) { struct nlattr *attr = info->attrs[id]; if (!attr) return 0; *limit = nla_get_u32(attr); if (*limit > MPTCP_PM_ADDR_MAX) { GENL_SET_ERR_MSG(info, "limit greater than maximum"); return -EINVAL; } return 0; } static int mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); unsigned int rcv_addrs, subflows; int ret; spin_lock_bh(&pernet->lock); rcv_addrs = pernet->add_addr_accept_max; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; subflows = pernet->subflows_max; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); WRITE_ONCE(pernet->subflows_max, subflows); unlock: spin_unlock_bh(&pernet->lock); return ret; } static int mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct sk_buff *msg; void *reply; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, MPTCP_PM_CMD_GET_LIMITS); if (!reply) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, READ_ONCE(pernet->add_addr_accept_max))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, READ_ONCE(pernet->subflows_max))) goto fail; genlmsg_end(msg, reply); return genlmsg_reply(msg, info); fail: GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); nlmsg_free(msg); return -EMSGSIZE; } static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, struct mptcp_addr_info *addr) { struct mptcp_rm_list list = { .nr = 0 }; list.ids[list.nr++] = addr->id; spin_lock_bh(&msk->pm.lock); mptcp_pm_nl_rm_subflow_received(msk, &list); mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); } static int mptcp_nl_set_flags(struct net *net, struct mptcp_addr_info *addr, u8 bkup, u8 changed) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; int ret = -EINVAL; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) goto next; lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) mptcp_pm_nl_fullmesh(msk, addr); release_sock(sk); next: sock_put(sk); cond_resched(); } return ret; } int mptcp_pm_nl_set_flags(struct net *net, struct mptcp_pm_addr_entry *addr, u8 bkup) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | MPTCP_PM_ADDR_FLAG_FULLMESH; struct mptcp_pm_addr_entry *entry; u8 lookup_by_id = 0; if (addr->addr.family == AF_UNSPEC) { lookup_by_id = 1; if (!addr->addr.id) return -EOPNOTSUPP; } spin_lock_bh(&pernet->lock); entry = __lookup_addr(pernet, &addr->addr, lookup_by_id); if (!entry) { spin_unlock_bh(&pernet->lock); return -EINVAL; } if ((addr->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { spin_unlock_bh(&pernet->lock); return -EINVAL; } changed = (addr->flags ^ entry->flags) & mask; entry->flags = (entry->flags & ~mask) | (addr->flags & mask); *addr = *entry; spin_unlock_bh(&pernet->lock); mptcp_nl_set_flags(net, &addr->addr, bkup, changed); return 0; } static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) { struct mptcp_pm_addr_entry remote = { .addr = { .family = AF_UNSPEC }, }; struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct net *net = sock_net(skb->sk); u8 bkup = 0; int ret; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; if (attr_rem) { ret = mptcp_pm_parse_entry(attr_rem, info, false, &remote); if (ret < 0) return ret; } if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; return mptcp_pm_set_flags(net, token, &addr, &remote, bkup); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, nlskb, 0, MPTCP_PM_EV_GRP_OFFSET, gfp); } bool mptcp_userspace_pm_active(const struct mptcp_sock *msk) { return genl_has_listeners(&mptcp_genl_family, sock_net((const struct sock *)msk), MPTCP_PM_EV_GRP_OFFSET); } static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) { const struct inet_sock *issk = inet_sk(ssk); const struct mptcp_subflow_context *sf; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) return -EMSGSIZE; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) return -EMSGSIZE; if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, issk->inet_daddr)) return -EMSGSIZE; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *np = inet6_sk(ssk); if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; break; } #endif default: WARN_ON_ONCE(1); return -EMSGSIZE; } if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) return -EMSGSIZE; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, issk->inet_dport)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_LOC_ID, sf->local_id)) return -EMSGSIZE; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, sf->remote_id)) return -EMSGSIZE; return 0; } static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct sock *sk = (const struct sock *)msk; const struct mptcp_subflow_context *sf; u8 sk_err; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) return -EMSGSIZE; if (mptcp_event_add_subflow(skb, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (WARN_ON_ONCE(!sf)) return -EINVAL; if (nla_put_u8(skb, MPTCP_ATTR_BACKUP, sf->backup)) return -EMSGSIZE; if (ssk->sk_bound_dev_if && nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) return -EMSGSIZE; sk_err = READ_ONCE(ssk->sk_err); if (sk_err && sk->sk_state == TCP_ESTABLISHED && nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) return -EMSGSIZE; return 0; } static int mptcp_event_sub_established(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { return mptcp_event_put_token_and_ssk(skb, msk, ssk); } static int mptcp_event_sub_closed(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { const struct mptcp_subflow_context *sf; if (mptcp_event_put_token_and_ssk(skb, msk, ssk)) return -EMSGSIZE; sf = mptcp_subflow_ctx(ssk); if (!sf->reset_seen) return 0; if (nla_put_u32(skb, MPTCP_ATTR_RESET_REASON, sf->reset_reason)) return -EMSGSIZE; if (nla_put_u32(skb, MPTCP_ATTR_RESET_FLAGS, sf->reset_transient)) return -EMSGSIZE; return 0; } static int mptcp_event_created(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token); if (err) return err; if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; return mptcp_event_add_subflow(skb, ssk); } void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_REMOVED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) goto nla_put_failure; genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, MPTCP_EVENT_ANNOUNCED); if (!nlh) goto nla_put_failure; if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_DPORT, info->port == 0 ? inet_sk(ssk)->inet_dport : info->port)) goto nla_put_failure; switch (info->family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_DADDR4, info->addr.s_addr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &info->addr6)) goto nla_put_failure; break; #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_ATOMIC); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event) { const struct inet_sock *issk = inet_sk(ssk); struct net *net = sock_net(ssk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event); if (!nlh) goto nla_put_failure; if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) goto nla_put_failure; if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) goto nla_put_failure; switch (ssk->sk_family) { case AF_INET: if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) goto nla_put_failure; break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { const struct ipv6_pinfo *np = inet6_sk(ssk); if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) goto nla_put_failure; break; } #endif default: WARN_ON_ONCE(1); goto nla_put_failure; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, GFP_KERNEL); return; nla_put_failure: nlmsg_free(skb); } void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { struct net *net = sock_net((const struct sock *)msk); struct nlmsghdr *nlh; struct sk_buff *skb; if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) return; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); if (!skb) return; nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, type); if (!nlh) goto nla_put_failure; switch (type) { case MPTCP_EVENT_UNSPEC: WARN_ON_ONCE(1); break; case MPTCP_EVENT_CREATED: case MPTCP_EVENT_ESTABLISHED: if (mptcp_event_created(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_CLOSED: if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0) goto nla_put_failure; break; case MPTCP_EVENT_ANNOUNCED: case MPTCP_EVENT_REMOVED: /* call mptcp_event_addr_announced()/removed instead */ WARN_ON_ONCE(1); break; case MPTCP_EVENT_SUB_ESTABLISHED: case MPTCP_EVENT_SUB_PRIORITY: if (mptcp_event_sub_established(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_SUB_CLOSED: if (mptcp_event_sub_closed(skb, msk, ssk) < 0) goto nla_put_failure; break; case MPTCP_EVENT_LISTENER_CREATED: case MPTCP_EVENT_LISTENER_CLOSED: break; } genlmsg_end(skb, nlh); mptcp_nl_mcast_send(net, skb, gfp); return; nla_put_failure: nlmsg_free(skb); } static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_ADD_ADDR, .doit = mptcp_nl_cmd_add_addr, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_DEL_ADDR, .doit = mptcp_nl_cmd_del_addr, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, .doit = mptcp_nl_cmd_flush_addrs, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_GET_ADDR, .doit = mptcp_nl_cmd_get_addr, .dumpit = mptcp_nl_cmd_dump_addrs, }, { .cmd = MPTCP_PM_CMD_SET_LIMITS, .doit = mptcp_nl_cmd_set_limits, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_GET_LIMITS, .doit = mptcp_nl_cmd_get_limits, }, { .cmd = MPTCP_PM_CMD_SET_FLAGS, .doit = mptcp_nl_cmd_set_flags, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_ANNOUNCE, .doit = mptcp_nl_cmd_announce, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_REMOVE, .doit = mptcp_nl_cmd_remove, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, .doit = mptcp_nl_cmd_sf_create, .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, .doit = mptcp_nl_cmd_sf_destroy, .flags = GENL_UNS_ADMIN_PERM, }, }; static struct genl_family mptcp_genl_family __ro_after_init = { .name = MPTCP_PM_NAME, .version = MPTCP_PM_VER, .maxattr = MPTCP_PM_ATTR_MAX, .policy = mptcp_pm_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = mptcp_pm_ops, .n_small_ops = ARRAY_SIZE(mptcp_pm_ops), .resv_start_op = MPTCP_PM_CMD_SUBFLOW_DESTROY + 1, .mcgrps = mptcp_pm_mcgrps, .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; static int __net_init pm_nl_init_net(struct net *net) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); /* Cit. 2 subflows ought to be enough for anybody. */ pernet->subflows_max = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); /* No need to initialize other pernet fields, the struct is zeroed at * allocation time. */ return 0; } static void __net_exit pm_nl_exit_net(struct list_head *net_list) { struct net *net; list_for_each_entry(net, net_list, exit_list) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); /* net is removed from namespace list, can't race with * other modifiers, also netns core already waited for a * RCU grace period. */ __flush_addrs(&pernet->local_addr_list); } } static struct pernet_operations mptcp_pm_pernet_ops = { .init = pm_nl_init_net, .exit_batch = pm_nl_exit_net, .id = &pm_nl_pernet_id, .size = sizeof(struct pm_nl_pernet), }; void __init mptcp_pm_nl_init(void) { if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) panic("Failed to register MPTCP PM pernet subsystem.\n"); if (genl_register_family(&mptcp_genl_family)) panic("Failed to register MPTCP PM netlink family\n"); }
896 898 897 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 // SPDX-License-Identifier: GPL-2.0-only /* * rcuref - A scalable reference count implementation for RCU managed objects * * rcuref is provided to replace open coded reference count implementations * based on atomic_t. It protects explicitely RCU managed objects which can * be visible even after the last reference has been dropped and the object * is heading towards destruction. * * A common usage pattern is: * * get() * rcu_read_lock(); * p = get_ptr(); * if (p && !atomic_inc_not_zero(&p->refcnt)) * p = NULL; * rcu_read_unlock(); * return p; * * put() * if (!atomic_dec_return(&->refcnt)) { * remove_ptr(p); * kfree_rcu((p, rcu); * } * * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has * O(N^2) behaviour under contention with N concurrent operations. * * rcuref uses atomic_add_negative_relaxed() for the fast path, which scales * better under contention. * * Why not refcount? * ================= * * In principle it should be possible to make refcount use the rcuref * scheme, but the destruction race described below cannot be prevented * unless the protected object is RCU managed. * * Theory of operation * =================== * * rcuref uses an unsigned integer reference counter. As long as the * counter value is greater than or equal to RCUREF_ONEREF and not larger * than RCUREF_MAXREF the reference is alive: * * ONEREF MAXREF SATURATED RELEASED DEAD NOREF * 0 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF * <---valid --------> <-------saturation zone-------> <-----dead zone-----> * * The get() and put() operations do unconditional increments and * decrements. The result is checked after the operation. This optimizes * for the fast path. * * If the reference count is saturated or dead, then the increments and * decrements are not harmful as the reference count still stays in the * respective zones and is always set back to STATURATED resp. DEAD. The * zones have room for 2^28 racing operations in each direction, which * makes it practically impossible to escape the zones. * * Once the last reference is dropped the reference count becomes * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The * slowpath then tries to set the reference count from RCUREF_NOREF to * RCUREF_DEAD via a cmpxchg(). This opens a small window where a * concurrent rcuref_get() can acquire the reference count and bring it * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. * * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in * DEAD + 1, which is inside the dead zone. If that happens the reference * count is put back to DEAD. * * The actual race is possible due to the unconditional increment and * decrements in rcuref_get() and rcuref_put(): * * T1 T2 * get() put() * if (atomic_add_negative(-1, &ref->refcnt)) * succeeds-> atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); * * atomic_add_negative(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 * * As the result of T1's add is negative, the get() goes into the slow path * and observes refcnt being in the dead zone which makes the operation fail. * * Possible critical states: * * Context Counter References Operation * T1 0 1 init() * T2 1 2 get() * T1 0 1 put() * T2 -1 0 put() tries to mark dead * T1 0 1 get() * T2 0 1 put() mark dead fails * T1 -1 0 put() tries to mark dead * T1 DEAD 0 put() mark dead succeeds * T2 DEAD+1 0 get() fails and puts it back to DEAD * * Of course there are more complex scenarios, but the above illustrates * the working principle. The rest is left to the imagination of the * reader. * * Deconstruction race * =================== * * The release operation must be protected by prohibiting a grace period in * order to prevent a possible use after free: * * T1 T2 * put() get() * // ref->refcnt = ONEREF * if (!atomic_add_negative(-1, &ref->refcnt)) * return false; <- Not taken * * // ref->refcnt == NOREF * --> preemption * // Elevates ref->refcnt to ONEREF * if (!atomic_add_negative(1, &ref->refcnt)) * return true; <- taken * * if (put(&p->ref)) { <-- Succeeds * remove_pointer(p); * kfree_rcu(p, rcu); * } * * RCU grace period ends, object is freed * * atomic_cmpxchg(&ref->refcnt, NOREF, DEAD); <- UAF * * This is prevented by disabling preemption around the put() operation as * that's in most kernel configurations cheaper than a rcu_read_lock() / * rcu_read_unlock() pair and in many cases even a NOOP. In any case it * prevents the grace period which keeps the object alive until all put() * operations complete. * * Saturation protection * ===================== * * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). * Once this is exceedded the reference count becomes stale by setting it * to RCUREF_SATURATED, which will cause a memory leak, but it prevents * wrap arounds which obviously cause worse problems than a memory * leak. When saturation is reached a warning is emitted. * * Race conditions * =============== * * All reference count increment/decrement operations are unconditional and * only verified after the fact. This optimizes for the good case and takes * the occasional race vs. a dead or already saturated refcount into * account. The saturation and dead zones are large enough to accomodate * for that. * * Memory ordering * =============== * * Memory ordering rules are slightly relaxed wrt regular atomic_t functions * and provide only what is strictly required for refcounts. * * The increments are fully relaxed; these will not provide ordering. The * rationale is that whatever is used to obtain the object to increase the * reference count on will provide the ordering. For locked data * structures, its the lock acquire, for RCU/lockless data structures its * the dependent load. * * rcuref_get() provides a control dependency ordering future stores which * ensures that the object is not modified when acquiring a reference * fails. * * rcuref_put() provides release order, i.e. all prior loads and stores * will be issued before. It also provides a control dependency ordering * against the subsequent destruction of the object. * * If rcuref_put() successfully dropped the last reference and marked the * object DEAD it also provides acquire ordering. */ #include <linux/export.h> #include <linux/rcuref.h> /** * rcuref_get_slowpath - Slowpath of rcuref_get() * @ref: Pointer to the reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * False if the reference count was already marked dead * * True if the reference count is saturated, which prevents the * object from being deconstructed ever. */ bool rcuref_get_slowpath(rcuref_t *ref) { unsigned int cnt = atomic_read(&ref->refcnt); /* * If the reference count was already marked dead, undo the * increment so it stays in the middle of the dead zone and return * fail. */ if (cnt >= RCUREF_RELEASED) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * If it was saturated, warn and mark it so. In case the increment * was already on a saturated value restore the saturation * marker. This keeps it in the middle of the saturation zone and * prevents the reference count from overflowing. This leaks the * object memory, but prevents the obvious reference count overflow * damage. */ if (WARN_ONCE(cnt > RCUREF_MAXREF, "rcuref saturated - leaking memory")) atomic_set(&ref->refcnt, RCUREF_SATURATED); return true; } EXPORT_SYMBOL_GPL(rcuref_get_slowpath); /** * rcuref_put_slowpath - Slowpath of __rcuref_put() * @ref: Pointer to the reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool rcuref_put_slowpath(rcuref_t *ref) { unsigned int cnt = atomic_read(&ref->refcnt); /* Did this drop the last reference? */ if (likely(cnt == RCUREF_NOREF)) { /* * Carefully try to set the reference count to RCUREF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { atomic_set(&ref->refcnt, RCUREF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > RCUREF_MAXREF) atomic_set(&ref->refcnt, RCUREF_SATURATED); return false; } EXPORT_SYMBOL_GPL(rcuref_put_slowpath);
1814 421 625 625 3318 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_INTERNAL_H #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> #include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ }; extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION int blk_crypto_sysfs_register(struct gendisk *disk); void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, struct bio_crypt_ctx *bc2); static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), bio->bi_crypt_context); } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return bio_crypt_ctx_mergeable(bio->bi_crypt_context, bio->bi_iter.bi_size, req->crypt_ctx); } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), next->crypt_ctx); } static inline void blk_crypto_rq_set_defaults(struct request *rq) { rq->crypt_ctx = NULL; rq->crypt_keyslot = NULL; } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return rq->crypt_ctx; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return rq->crypt_keyslot; } blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, const struct blk_crypto_key *key, struct blk_crypto_keyslot **slot_ptr); void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); int __blk_crypto_evict_key(struct blk_crypto_profile *profile, const struct blk_crypto_key *key); bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) { } static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_front_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_back_mergeable(struct request *req, struct bio *bio) { return true; } static inline bool bio_crypt_ctx_merge_rq(struct request *req, struct request *next) { return true; } static inline void blk_crypto_rq_set_defaults(struct request *rq) { } static inline bool blk_crypto_rq_is_encrypted(struct request *rq) { return false; } static inline bool blk_crypto_rq_has_keyslot(struct request *rq) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) { if (bio_has_crypt_ctx(bio)) __bio_crypt_advance(bio, bytes); } void __bio_crypt_free_ctx(struct bio *bio); static inline void bio_crypt_free_ctx(struct bio *bio) { if (bio_has_crypt_ctx(bio)) __bio_crypt_free_ctx(bio); } static inline void bio_crypt_do_front_merge(struct request *rq, struct bio *bio) { #ifdef CONFIG_BLK_INLINE_ENCRYPTION if (bio_has_crypt_ctx(bio)) memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, sizeof(rq->crypt_ctx->bc_dun)); #endif } bool __blk_crypto_bio_prep(struct bio **bio_ptr); static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) { if (bio_has_crypt_ctx(*bio_ptr)) return __blk_crypto_bio_prep(bio_ptr); return true; } blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } void __blk_crypto_rq_put_keyslot(struct request *rq); static inline void blk_crypto_rq_put_keyslot(struct request *rq) { if (blk_crypto_rq_has_keyslot(rq)) __blk_crypto_rq_put_keyslot(rq); } void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) __blk_crypto_free_request(rq); } int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask); /** * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio * is inserted * @rq: The request to prepare * @bio: The first bio being inserted into the request * @gfp_mask: Memory allocation flags * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, gfp_t gfp_mask) { if (bio_has_crypt_ctx(bio)) return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); return 0; } #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); #else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ static inline int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { pr_warn_once("crypto API fallback is disabled\n"); return -ENOPKG; } static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) { pr_warn_once("crypto API fallback disabled; failing request.\n"); (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; return false; } static inline int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { return 0; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ #endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio balloon implementation, inspired by Dor Laor and Marcelo * Tosatti's implementations. * * Copyright 2008 Rusty Russell IBM Corporation */ #include <linux/virtio.h> #include <linux/virtio_balloon.h> #include <linux/swap.h> #include <linux/workqueue.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/balloon_compaction.h> #include <linux/oom.h> #include <linux/wait.h> #include <linux/mm.h> #include <linux/page_reporting.h> /* * Balloon device works in 4K page units. So each page is pointed to by * multiple balloon pages. All memory counters in this driver are in balloon * page units. */ #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned int)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT) #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256 /* Maximum number of (4k) pages to deflate on OOM notifications. */ #define VIRTIO_BALLOON_OOM_NR_PAGES 256 #define VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY 80 #define VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG (__GFP_NORETRY | __GFP_NOWARN | \ __GFP_NOMEMALLOC) /* The order of free page blocks to report to host */ #define VIRTIO_BALLOON_HINT_BLOCK_ORDER MAX_ORDER /* The size of a free page block in bytes */ #define VIRTIO_BALLOON_HINT_BLOCK_BYTES \ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, VIRTIO_BALLOON_VQ_STATS, VIRTIO_BALLOON_VQ_FREE_PAGE, VIRTIO_BALLOON_VQ_REPORTING, VIRTIO_BALLOON_VQ_MAX }; enum virtio_balloon_config_read { VIRTIO_BALLOON_CONFIG_READ_CMD_ID = 0, }; struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; /* Balloon's own wq for cpu-intensive work items */ struct workqueue_struct *balloon_wq; /* The free page reporting work item submitted to the balloon wq */ struct work_struct report_free_page_work; /* The balloon servicing is delegated to a freezable workqueue. */ struct work_struct update_balloon_stats_work; struct work_struct update_balloon_size_work; /* Prevent updating balloon when it is being canceled. */ spinlock_t stop_update_lock; bool stop_update; /* Bitmap to indicate if reading the related config fields are needed */ unsigned long config_read_bitmap; /* The list of allocated free pages, waiting to be given back to mm */ struct list_head free_page_list; spinlock_t free_page_list_lock; /* The number of free page blocks on the above list */ unsigned long num_free_page_blocks; /* * The cmd id received from host. * Read it via virtio_balloon_cmd_id_received to get the latest value * sent from host. */ u32 cmd_id_received_cache; /* The cmd id that is actively in use */ __virtio32 cmd_id_active; /* Buffer to store the stop sign */ __virtio32 cmd_id_stop; /* Waiting for host to ack the pages we released. */ wait_queue_head_t acked; /* Number of balloon pages we've told the Host we're not using. */ unsigned int num_pages; /* * The pages we've told the Host we're not using are enqueued * at vb_dev_info->pages list. * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE * to num_pages above. */ struct balloon_dev_info vb_dev_info; /* Synchronize access/update to this struct virtio_balloon elements */ struct mutex balloon_lock; /* The array of pfns we tell the Host about. */ unsigned int num_pfns; __virtio32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; /* Memory statistics */ struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */ struct shrinker shrinker; /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */ struct notifier_block oom_nb; /* Free page reporting device */ struct virtqueue *reporting_vq; struct page_reporting_dev_info pr_dev_info; }; static const struct virtio_device_id id_table[] = { { VIRTIO_ID_BALLOON, VIRTIO_DEV_ANY_ID }, { 0 }, }; static u32 page_to_balloon_pfn(struct page *page) { unsigned long pfn = page_to_pfn(page); BUILD_BUG_ON(PAGE_SHIFT < VIRTIO_BALLOON_PFN_SHIFT); /* Convert pfn from Linux page size to balloon page size. */ return pfn * VIRTIO_BALLOON_PAGES_PER_PAGE; } static void balloon_ack(struct virtqueue *vq) { struct virtio_balloon *vb = vq->vdev->priv; wake_up(&vb->acked); } static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) { struct scatterlist sg; unsigned int len; sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns); /* We should always be able to add one buffer to an empty queue. */ virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &len)); } static int virtballoon_free_page_report(struct page_reporting_dev_info *pr_dev_info, struct scatterlist *sg, unsigned int nents) { struct virtio_balloon *vb = container_of(pr_dev_info, struct virtio_balloon, pr_dev_info); struct virtqueue *vq = vb->reporting_vq; unsigned int unused, err; /* We should always be able to add these buffers to an empty queue. */ err = virtqueue_add_inbuf(vq, sg, nents, vb, GFP_NOWAIT | __GFP_NOWARN); /* * In the extremely unlikely case that something has occurred and we * are able to trigger an error we will simply display a warning * and exit without actually processing the pages. */ if (WARN_ON_ONCE(err)) return err; virtqueue_kick(vq); /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &unused)); return 0; } static void set_page_pfns(struct virtio_balloon *vb, __virtio32 pfns[], struct page *page) { unsigned int i; BUILD_BUG_ON(VIRTIO_BALLOON_PAGES_PER_PAGE > VIRTIO_BALLOON_ARRAY_PFNS_MAX); /* * Set balloon pfns pointing at this page. * Note that the first pfn points at start of the page. */ for (i = 0; i < VIRTIO_BALLOON_PAGES_PER_PAGE; i++) pfns[i] = cpu_to_virtio32(vb->vdev, page_to_balloon_pfn(page) + i); } static unsigned int fill_balloon(struct virtio_balloon *vb, size_t num) { unsigned int num_allocated_pages; unsigned int num_pfns; struct page *page; LIST_HEAD(pages); /* We can only do one array worth at a time. */ num = min(num, ARRAY_SIZE(vb->pfns)); for (num_pfns = 0; num_pfns < num; num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { struct page *page = balloon_page_alloc(); if (!page) { dev_info_ratelimited(&vb->vdev->dev, "Out of puff! Can't get %u pages\n", VIRTIO_BALLOON_PAGES_PER_PAGE); /* Sleep for at least 1/5 of a second before retry. */ msleep(200); break; } balloon_page_push(&pages, page); } mutex_lock(&vb->balloon_lock); vb->num_pfns = 0; while ((page = balloon_page_pop(&pages))) { balloon_page_enqueue(&vb->vb_dev_info, page); set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, -1); vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; } num_allocated_pages = vb->num_pfns; /* Did we get any? */ if (vb->num_pfns != 0) tell_host(vb, vb->inflate_vq); mutex_unlock(&vb->balloon_lock); return num_allocated_pages; } static void release_pages_balloon(struct virtio_balloon *vb, struct list_head *pages) { struct page *page, *next; list_for_each_entry_safe(page, next, pages, lru) { if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, 1); list_del(&page->lru); put_page(page); /* balloon reference */ } } static unsigned int leak_balloon(struct virtio_balloon *vb, size_t num) { unsigned int num_freed_pages; struct page *page; struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; LIST_HEAD(pages); /* We can only do one array worth at a time. */ num = min(num, ARRAY_SIZE(vb->pfns)); mutex_lock(&vb->balloon_lock); /* We can't release more pages than taken */ num = min(num, (size_t)vb->num_pages); for (vb->num_pfns = 0; vb->num_pfns < num; vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) { page = balloon_page_dequeue(vb_dev_info); if (!page) break; set_page_pfns(vb, vb->pfns + vb->num_pfns, page); list_add(&page->lru, &pages); vb->num_pages -= VIRTIO_BALLOON_PAGES_PER_PAGE; } num_freed_pages = vb->num_pfns; /* * Note that if * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); * is true, we *have* to do it in this order */ if (vb->num_pfns != 0) tell_host(vb, vb->deflate_vq); release_pages_balloon(vb, &pages); mutex_unlock(&vb->balloon_lock); return num_freed_pages; } static inline void update_stat(struct virtio_balloon *vb, int idx, u16 tag, u64 val) { BUG_ON(idx >= VIRTIO_BALLOON_S_NR); vb->stats[idx].tag = cpu_to_virtio16(vb->vdev, tag); vb->stats[idx].val = cpu_to_virtio64(vb->vdev, val); } #define pages_to_bytes(x) ((u64)(x) << PAGE_SHIFT) static unsigned int update_balloon_stats(struct virtio_balloon *vb) { unsigned long events[NR_VM_EVENT_ITEMS]; struct sysinfo i; unsigned int idx = 0; long available; unsigned long caches; all_vm_events(events); si_meminfo(&i); available = si_mem_available(); caches = global_node_page_state(NR_FILE_PAGES); #ifdef CONFIG_VM_EVENT_COUNTERS update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN, pages_to_bytes(events[PSWPIN])); update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT, pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); #ifdef CONFIG_HUGETLB_PAGE update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, events[HTLB_BUDDY_PGALLOC]); update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, events[HTLB_BUDDY_PGALLOC_FAIL]); #endif #endif update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT, pages_to_bytes(i.totalram)); update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL, pages_to_bytes(available)); update_stat(vb, idx++, VIRTIO_BALLOON_S_CACHES, pages_to_bytes(caches)); return idx; } /* * While most virtqueues communicate guest-initiated requests to the hypervisor, * the stats queue operates in reverse. The driver initializes the virtqueue * with a single buffer. From that point forward, all conversations consist of * a hypervisor request (a call to this function) which directs us to refill * the virtqueue with a fresh stats buffer. Since stats collection can sleep, * we delegate the job to a freezable workqueue that will do the actual work via * stats_handle_request(). */ static void stats_request(struct virtqueue *vq) { struct virtio_balloon *vb = vq->vdev->priv; spin_lock(&vb->stop_update_lock); if (!vb->stop_update) queue_work(system_freezable_wq, &vb->update_balloon_stats_work); spin_unlock(&vb->stop_update_lock); } static void stats_handle_request(struct virtio_balloon *vb) { struct virtqueue *vq; struct scatterlist sg; unsigned int len, num_stats; num_stats = update_balloon_stats(vb); vq = vb->stats_vq; if (!virtqueue_get_buf(vq, &len)) return; sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL); virtqueue_kick(vq); } static inline s64 towards_target(struct virtio_balloon *vb) { s64 target; u32 num_pages; /* Legacy balloon config space is LE, unlike all other devices. */ virtio_cread_le(vb->vdev, struct virtio_balloon_config, num_pages, &num_pages); target = num_pages; return target - vb->num_pages; } /* Gives back @num_to_return blocks of free pages to mm. */ static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb, unsigned long num_to_return) { struct page *page; unsigned long num_returned; spin_lock_irq(&vb->free_page_list_lock); for (num_returned = 0; num_returned < num_to_return; num_returned++) { page = balloon_page_pop(&vb->free_page_list); if (!page) break; free_pages((unsigned long)page_address(page), VIRTIO_BALLOON_HINT_BLOCK_ORDER); } vb->num_free_page_blocks -= num_returned; spin_unlock_irq(&vb->free_page_list_lock); return num_returned; } static void virtio_balloon_queue_free_page_work(struct virtio_balloon *vb) { if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) return; /* No need to queue the work if the bit was already set. */ if (test_and_set_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID, &vb->config_read_bitmap)) return; queue_work(vb->balloon_wq, &vb->report_free_page_work); } static void virtballoon_changed(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; unsigned long flags; spin_lock_irqsave(&vb->stop_update_lock, flags); if (!vb->stop_update) { queue_work(system_freezable_wq, &vb->update_balloon_size_work); virtio_balloon_queue_free_page_work(vb); } spin_unlock_irqrestore(&vb->stop_update_lock, flags); } static void update_balloon_size(struct virtio_balloon *vb) { u32 actual = vb->num_pages; /* Legacy balloon config space is LE, unlike all other devices. */ virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, actual, &actual); } static void update_balloon_stats_func(struct work_struct *work) { struct virtio_balloon *vb; vb = container_of(work, struct virtio_balloon, update_balloon_stats_work); stats_handle_request(vb); } static void update_balloon_size_func(struct work_struct *work) { struct virtio_balloon *vb; s64 diff; vb = container_of(work, struct virtio_balloon, update_balloon_size_work); diff = towards_target(vb); if (!diff) return; if (diff > 0) diff -= fill_balloon(vb, diff); else diff += leak_balloon(vb, -diff); update_balloon_size(vb); if (diff) queue_work(system_freezable_wq, work); } static int init_vqs(struct virtio_balloon *vb) { struct virtqueue *vqs[VIRTIO_BALLOON_VQ_MAX]; vq_callback_t *callbacks[VIRTIO_BALLOON_VQ_MAX]; const char *names[VIRTIO_BALLOON_VQ_MAX]; int err; /* * Inflateq and deflateq are used unconditionally. The names[] * will be NULL if the related feature is not enabled, which will * cause no allocation for the corresponding virtqueue in find_vqs. */ callbacks[VIRTIO_BALLOON_VQ_INFLATE] = balloon_ack; names[VIRTIO_BALLOON_VQ_INFLATE] = "inflate"; callbacks[VIRTIO_BALLOON_VQ_DEFLATE] = balloon_ack; names[VIRTIO_BALLOON_VQ_DEFLATE] = "deflate"; callbacks[VIRTIO_BALLOON_VQ_STATS] = NULL; names[VIRTIO_BALLOON_VQ_STATS] = NULL; callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; names[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; names[VIRTIO_BALLOON_VQ_REPORTING] = NULL; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { names[VIRTIO_BALLOON_VQ_STATS] = "stats"; callbacks[VIRTIO_BALLOON_VQ_STATS] = stats_request; } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { names[VIRTIO_BALLOON_VQ_FREE_PAGE] = "free_page_vq"; callbacks[VIRTIO_BALLOON_VQ_FREE_PAGE] = NULL; } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { names[VIRTIO_BALLOON_VQ_REPORTING] = "reporting_vq"; callbacks[VIRTIO_BALLOON_VQ_REPORTING] = balloon_ack; } err = virtio_find_vqs(vb->vdev, VIRTIO_BALLOON_VQ_MAX, vqs, callbacks, names, NULL); if (err) return err; vb->inflate_vq = vqs[VIRTIO_BALLOON_VQ_INFLATE]; vb->deflate_vq = vqs[VIRTIO_BALLOON_VQ_DEFLATE]; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) { struct scatterlist sg; unsigned int num_stats; vb->stats_vq = vqs[VIRTIO_BALLOON_VQ_STATS]; /* * Prime this virtqueue with one buffer so the hypervisor can * use it to signal us later (it can't be broken yet!). */ num_stats = update_balloon_stats(vb); sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats); err = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL); if (err) { dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n", __func__); return err; } virtqueue_kick(vb->stats_vq); } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) vb->free_page_vq = vqs[VIRTIO_BALLOON_VQ_FREE_PAGE]; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) vb->reporting_vq = vqs[VIRTIO_BALLOON_VQ_REPORTING]; return 0; } static u32 virtio_balloon_cmd_id_received(struct virtio_balloon *vb) { if (test_and_clear_bit(VIRTIO_BALLOON_CONFIG_READ_CMD_ID, &vb->config_read_bitmap)) { /* Legacy balloon config space is LE, unlike all other devices. */ virtio_cread_le(vb->vdev, struct virtio_balloon_config, free_page_hint_cmd_id, &vb->cmd_id_received_cache); } return vb->cmd_id_received_cache; } static int send_cmd_id_start(struct virtio_balloon *vb) { struct scatterlist sg; struct virtqueue *vq = vb->free_page_vq; int err, unused; /* Detach all the used buffers from the vq */ while (virtqueue_get_buf(vq, &unused)) ; vb->cmd_id_active = cpu_to_virtio32(vb->vdev, virtio_balloon_cmd_id_received(vb)); sg_init_one(&sg, &vb->cmd_id_active, sizeof(vb->cmd_id_active)); err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_active, GFP_KERNEL); if (!err) virtqueue_kick(vq); return err; } static int send_cmd_id_stop(struct virtio_balloon *vb) { struct scatterlist sg; struct virtqueue *vq = vb->free_page_vq; int err, unused; /* Detach all the used buffers from the vq */ while (virtqueue_get_buf(vq, &unused)) ; sg_init_one(&sg, &vb->cmd_id_stop, sizeof(vb->cmd_id_stop)); err = virtqueue_add_outbuf(vq, &sg, 1, &vb->cmd_id_stop, GFP_KERNEL); if (!err) virtqueue_kick(vq); return err; } static int get_free_page_and_send(struct virtio_balloon *vb) { struct virtqueue *vq = vb->free_page_vq; struct page *page; struct scatterlist sg; int err, unused; void *p; /* Detach all the used buffers from the vq */ while (virtqueue_get_buf(vq, &unused)) ; page = alloc_pages(VIRTIO_BALLOON_FREE_PAGE_ALLOC_FLAG, VIRTIO_BALLOON_HINT_BLOCK_ORDER); /* * When the allocation returns NULL, it indicates that we have got all * the possible free pages, so return -EINTR to stop. */ if (!page) return -EINTR; p = page_address(page); sg_init_one(&sg, p, VIRTIO_BALLOON_HINT_BLOCK_BYTES); /* There is always 1 entry reserved for the cmd id to use. */ if (vq->num_free > 1) { err = virtqueue_add_inbuf(vq, &sg, 1, p, GFP_KERNEL); if (unlikely(err)) { free_pages((unsigned long)p, VIRTIO_BALLOON_HINT_BLOCK_ORDER); return err; } virtqueue_kick(vq); spin_lock_irq(&vb->free_page_list_lock); balloon_page_push(&vb->free_page_list, page); vb->num_free_page_blocks++; spin_unlock_irq(&vb->free_page_list_lock); } else { /* * The vq has no available entry to add this page block, so * just free it. */ free_pages((unsigned long)p, VIRTIO_BALLOON_HINT_BLOCK_ORDER); } return 0; } static int send_free_pages(struct virtio_balloon *vb) { int err; u32 cmd_id_active; while (1) { /* * If a stop id or a new cmd id was just received from host, * stop the reporting. */ cmd_id_active = virtio32_to_cpu(vb->vdev, vb->cmd_id_active); if (unlikely(cmd_id_active != virtio_balloon_cmd_id_received(vb))) break; /* * The free page blocks are allocated and sent to host one by * one. */ err = get_free_page_and_send(vb); if (err == -EINTR) break; else if (unlikely(err)) return err; } return 0; } static void virtio_balloon_report_free_page(struct virtio_balloon *vb) { int err; struct device *dev = &vb->vdev->dev; /* Start by sending the received cmd id to host with an outbuf. */ err = send_cmd_id_start(vb); if (unlikely(err)) dev_err(dev, "Failed to send a start id, err = %d\n", err); err = send_free_pages(vb); if (unlikely(err)) dev_err(dev, "Failed to send a free page, err = %d\n", err); /* End by sending a stop id to host with an outbuf. */ err = send_cmd_id_stop(vb); if (unlikely(err)) dev_err(dev, "Failed to send a stop id, err = %d\n", err); } static void report_free_page_func(struct work_struct *work) { struct virtio_balloon *vb = container_of(work, struct virtio_balloon, report_free_page_work); u32 cmd_id_received; cmd_id_received = virtio_balloon_cmd_id_received(vb); if (cmd_id_received == VIRTIO_BALLOON_CMD_ID_DONE) { /* Pass ULONG_MAX to give back all the free pages */ return_free_pages_to_mm(vb, ULONG_MAX); } else if (cmd_id_received != VIRTIO_BALLOON_CMD_ID_STOP && cmd_id_received != virtio32_to_cpu(vb->vdev, vb->cmd_id_active)) { virtio_balloon_report_free_page(vb); } } #ifdef CONFIG_BALLOON_COMPACTION /* * virtballoon_migratepage - perform the balloon page migration on behalf of * a compaction thread. (called under page lock) * @vb_dev_info: the balloon device * @newpage: page that will replace the isolated page after migration finishes. * @page : the isolated (old) page that is about to be migrated to newpage. * @mode : compaction mode -- not used for balloon page migration. * * After a ballooned page gets isolated by compaction procedures, this is the * function that performs the page migration on behalf of a compaction thread * The page migration for virtio balloon is done in a simple swap fashion which * follows these two macro steps: * 1) insert newpage into vb->pages list and update the host about it; * 2) update the host about the old page removed from vb->pages list; * * This function preforms the balloon page migration task. * Called through balloon_mapping->a_ops->migratepage */ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) { struct virtio_balloon *vb = container_of(vb_dev_info, struct virtio_balloon, vb_dev_info); unsigned long flags; /* * In order to avoid lock contention while migrating pages concurrently * to leak_balloon() or fill_balloon() we just give up the balloon_lock * this turn, as it is easier to retry the page migration later. * This also prevents fill_balloon() getting stuck into a mutex * recursion in the case it ends up triggering memory compaction * while it is attempting to inflate the ballon. */ if (!mutex_trylock(&vb->balloon_lock)) return -EAGAIN; get_page(newpage); /* balloon reference */ /* * When we migrate a page to a different zone and adjusted the * managed page count when inflating, we have to fixup the count of * both involved zones. */ if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM) && page_zone(page) != page_zone(newpage)) { adjust_managed_page_count(page, 1); adjust_managed_page_count(newpage, -1); } /* balloon's page migration 1st step -- inflate "newpage" */ spin_lock_irqsave(&vb_dev_info->pages_lock, flags); balloon_page_insert(vb_dev_info, newpage); vb_dev_info->isolated_pages--; __count_vm_event(BALLOON_MIGRATE); spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb, vb->pfns, newpage); tell_host(vb, vb->inflate_vq); /* balloon's page migration 2nd step -- deflate "page" */ spin_lock_irqsave(&vb_dev_info->pages_lock, flags); balloon_page_delete(page); spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; set_page_pfns(vb, vb->pfns, page); tell_host(vb, vb->deflate_vq); mutex_unlock(&vb->balloon_lock); put_page(page); /* balloon reference */ return MIGRATEPAGE_SUCCESS; } #endif /* CONFIG_BALLOON_COMPACTION */ static unsigned long shrink_free_pages(struct virtio_balloon *vb, unsigned long pages_to_free) { unsigned long blocks_to_free, blocks_freed; pages_to_free = round_up(pages_to_free, VIRTIO_BALLOON_HINT_BLOCK_PAGES); blocks_to_free = pages_to_free / VIRTIO_BALLOON_HINT_BLOCK_PAGES; blocks_freed = return_free_pages_to_mm(vb, blocks_to_free); return blocks_freed * VIRTIO_BALLOON_HINT_BLOCK_PAGES; } static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { struct virtio_balloon *vb = container_of(shrinker, struct virtio_balloon, shrinker); return shrink_free_pages(vb, sc->nr_to_scan); } static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { struct virtio_balloon *vb = container_of(shrinker, struct virtio_balloon, shrinker); return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES; } static int virtio_balloon_oom_notify(struct notifier_block *nb, unsigned long dummy, void *parm) { struct virtio_balloon *vb = container_of(nb, struct virtio_balloon, oom_nb); unsigned long *freed = parm; *freed += leak_balloon(vb, VIRTIO_BALLOON_OOM_NR_PAGES) / VIRTIO_BALLOON_PAGES_PER_PAGE; update_balloon_size(vb); return NOTIFY_OK; } static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) { unregister_shrinker(&vb->shrinker); } static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) { vb->shrinker.scan_objects = virtio_balloon_shrinker_scan; vb->shrinker.count_objects = virtio_balloon_shrinker_count; vb->shrinker.seeks = DEFAULT_SEEKS; return register_shrinker(&vb->shrinker, "virtio-balloon"); } static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; int err; if (!vdev->config->get) { dev_err(&vdev->dev, "%s failure: config access disabled\n", __func__); return -EINVAL; } vdev->priv = vb = kzalloc(sizeof(*vb), GFP_KERNEL); if (!vb) { err = -ENOMEM; goto out; } INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func); INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func); spin_lock_init(&vb->stop_update_lock); mutex_init(&vb->balloon_lock); init_waitqueue_head(&vb->acked); vb->vdev = vdev; balloon_devinfo_init(&vb->vb_dev_info); err = init_vqs(vb); if (err) goto out_free_vb; #ifdef CONFIG_BALLOON_COMPACTION vb->vb_dev_info.migratepage = virtballoon_migratepage; #endif if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { /* * There is always one entry reserved for cmd id, so the ring * size needs to be at least two to report free page hints. */ if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { err = -ENOSPC; goto out_del_vqs; } vb->balloon_wq = alloc_workqueue("balloon-wq", WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); if (!vb->balloon_wq) { err = -ENOMEM; goto out_del_vqs; } INIT_WORK(&vb->report_free_page_work, report_free_page_func); vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP; vb->cmd_id_active = cpu_to_virtio32(vb->vdev, VIRTIO_BALLOON_CMD_ID_STOP); vb->cmd_id_stop = cpu_to_virtio32(vb->vdev, VIRTIO_BALLOON_CMD_ID_STOP); spin_lock_init(&vb->free_page_list_lock); INIT_LIST_HEAD(&vb->free_page_list); /* * We're allowed to reuse any free pages, even if they are * still to be processed by the host. */ err = virtio_balloon_register_shrinker(vb); if (err) goto out_del_balloon_wq; } if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) { vb->oom_nb.notifier_call = virtio_balloon_oom_notify; vb->oom_nb.priority = VIRTIO_BALLOON_OOM_NOTIFY_PRIORITY; err = register_oom_notifier(&vb->oom_nb); if (err < 0) goto out_unregister_shrinker; } if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { /* Start with poison val of 0 representing general init */ __u32 poison_val = 0; /* * Let the hypervisor know that we are expecting a * specific value to be written back in balloon pages. * * If the PAGE_POISON value was larger than a byte we would * need to byte swap poison_val here to guarantee it is * little-endian. However for now it is a single byte so we * can pass it as-is. */ if (!want_init_on_free()) memset(&poison_val, PAGE_POISON, sizeof(poison_val)); virtio_cwrite_le(vb->vdev, struct virtio_balloon_config, poison_val, &poison_val); } vb->pr_dev_info.report = virtballoon_free_page_report; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) { unsigned int capacity; capacity = virtqueue_get_vring_size(vb->reporting_vq); if (capacity < PAGE_REPORTING_CAPACITY) { err = -ENOSPC; goto out_unregister_oom; } /* * The default page reporting order is @pageblock_order, which * corresponds to 512MB in size on ARM64 when 64KB base page * size is used. The page reporting won't be triggered if the * freeing page can't come up with a free area like that huge. * So we specify the page reporting order to 5, corresponding * to 2MB. It helps to avoid THP splitting if 4KB base page * size is used by host. * * Ideally, the page reporting order is selected based on the * host's base page size. However, it needs more work to report * that value. The hard-coded order would be fine currently. */ #if defined(CONFIG_ARM64) && defined(CONFIG_ARM64_64K_PAGES) vb->pr_dev_info.order = 5; #endif err = page_reporting_register(&vb->pr_dev_info); if (err) goto out_unregister_oom; } virtio_device_ready(vdev); if (towards_target(vb)) virtballoon_changed(vdev); return 0; out_unregister_oom: if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) unregister_oom_notifier(&vb->oom_nb); out_unregister_shrinker: if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) virtio_balloon_unregister_shrinker(vb); out_del_balloon_wq: if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) destroy_workqueue(vb->balloon_wq); out_del_vqs: vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); out: return err; } static void remove_common(struct virtio_balloon *vb) { /* There might be pages left in the balloon: free them. */ while (vb->num_pages) leak_balloon(vb, vb->num_pages); update_balloon_size(vb); /* There might be free pages that are being reported: release them. */ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) return_free_pages_to_mm(vb, ULONG_MAX); /* Now we reset the device so we can clean up the queues. */ virtio_reset_device(vb->vdev); vb->vdev->config->del_vqs(vb->vdev); } static void virtballoon_remove(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING)) page_reporting_unregister(&vb->pr_dev_info); if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) unregister_oom_notifier(&vb->oom_nb); if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) virtio_balloon_unregister_shrinker(vb); spin_lock_irq(&vb->stop_update_lock); vb->stop_update = true; spin_unlock_irq(&vb->stop_update_lock); cancel_work_sync(&vb->update_balloon_size_work); cancel_work_sync(&vb->update_balloon_stats_work); if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { cancel_work_sync(&vb->report_free_page_work); destroy_workqueue(vb->balloon_wq); } remove_common(vb); kfree(vb); } #ifdef CONFIG_PM_SLEEP static int virtballoon_freeze(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; /* * The workqueue is already frozen by the PM core before this * function is called. */ remove_common(vb); return 0; } static int virtballoon_restore(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; int ret; ret = init_vqs(vdev->priv); if (ret) return ret; virtio_device_ready(vdev); if (towards_target(vb)) virtballoon_changed(vdev); update_balloon_size(vb); return 0; } #endif static int virtballoon_validate(struct virtio_device *vdev) { /* * Inform the hypervisor that our pages are poisoned or * initialized. If we cannot do that then we should disable * page reporting as it could potentially change the contents * of our free pages. */ if (!want_init_on_free() && !page_poisoning_enabled_static()) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); else if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_REPORTING); __virtio_clear_bit(vdev, VIRTIO_F_ACCESS_PLATFORM); return 0; } static unsigned int features[] = { VIRTIO_BALLOON_F_MUST_TELL_HOST, VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_FREE_PAGE_HINT, VIRTIO_BALLOON_F_PAGE_POISON, VIRTIO_BALLOON_F_REPORTING, }; static struct virtio_driver virtio_balloon_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, .validate = virtballoon_validate, .probe = virtballoon_probe, .remove = virtballoon_remove, .config_changed = virtballoon_changed, #ifdef CONFIG_PM_SLEEP .freeze = virtballoon_freeze, .restore = virtballoon_restore, #endif }; module_virtio_driver(virtio_balloon_driver); MODULE_DEVICE_TABLE(virtio, id_table); MODULE_DESCRIPTION("Virtio balloon driver"); MODULE_LICENSE("GPL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 /* SPDX-License-Identifier: GPL-2.0+ */ /* * MACsec netdev header, used for h/w accelerated implementations. * * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net> */ #ifndef _NET_MACSEC_H_ #define _NET_MACSEC_H_ #include <linux/u64_stats_sync.h> #include <linux/if_vlan.h> #include <uapi/linux/if_link.h> #include <uapi/linux/if_macsec.h> #define MACSEC_DEFAULT_PN_LEN 4 #define MACSEC_XPN_PN_LEN 8 #define MACSEC_NUM_AN 4 /* 2 bits for the association number */ #define MACSEC_SCI_LEN 8 #define MACSEC_PORT_ES (htons(0x0001)) #define MACSEC_TCI_VERSION 0x80 #define MACSEC_TCI_ES 0x40 /* end station */ #define MACSEC_TCI_SC 0x20 /* SCI present */ #define MACSEC_TCI_SCB 0x10 /* epon */ #define MACSEC_TCI_E 0x08 /* encryption */ #define MACSEC_TCI_C 0x04 /* changed text */ #define MACSEC_AN_MASK 0x03 /* association number */ #define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) #define MACSEC_DEFAULT_ICV_LEN 16 typedef u64 __bitwise sci_t; typedef u32 __bitwise ssci_t; struct metadata_dst; typedef union salt { struct { u32 ssci; u64 pn; } __packed; u8 bytes[MACSEC_SALT_LEN]; } __packed salt_t; typedef union pn { struct { #if defined(__LITTLE_ENDIAN_BITFIELD) u32 lower; u32 upper; #elif defined(__BIG_ENDIAN_BITFIELD) u32 upper; u32 lower; #else #error "Please fix <asm/byteorder.h>" #endif }; u64 full64; } pn_t; /** * struct macsec_key - SA key * @id: user-provided key identifier * @tfm: crypto struct, key storage * @salt: salt used to generate IV in XPN cipher suites */ struct macsec_key { u8 id[MACSEC_KEYID_LEN]; struct crypto_aead *tfm; salt_t salt; }; struct macsec_rx_sc_stats { __u64 InOctetsValidated; __u64 InOctetsDecrypted; __u64 InPktsUnchecked; __u64 InPktsDelayed; __u64 InPktsOK; __u64 InPktsInvalid; __u64 InPktsLate; __u64 InPktsNotValid; __u64 InPktsNotUsingSA; __u64 InPktsUnusedSA; }; struct macsec_rx_sa_stats { __u32 InPktsOK; __u32 InPktsInvalid; __u32 InPktsNotValid; __u32 InPktsNotUsingSA; __u32 InPktsUnusedSA; }; struct macsec_tx_sa_stats { __u32 OutPktsProtected; __u32 OutPktsEncrypted; }; struct macsec_tx_sc_stats { __u64 OutPktsProtected; __u64 OutPktsEncrypted; __u64 OutOctetsProtected; __u64 OutOctetsEncrypted; }; struct macsec_dev_stats { __u64 OutPktsUntagged; __u64 InPktsUntagged; __u64 OutPktsTooLong; __u64 InPktsNoTag; __u64 InPktsBadTag; __u64 InPktsUnknownSCI; __u64 InPktsNoSCI; __u64 InPktsOverrun; }; /** * struct macsec_rx_sa - receive secure association * @active: * @next_pn: packet number expected for the next packet * @lock: protects next_pn manipulations * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats */ struct macsec_rx_sa { struct macsec_key key; ssci_t ssci; spinlock_t lock; union { pn_t next_pn_halves; u64 next_pn; }; refcount_t refcnt; bool active; struct macsec_rx_sa_stats __percpu *stats; struct macsec_rx_sc *sc; struct rcu_head rcu; }; struct pcpu_rx_sc_stats { struct macsec_rx_sc_stats stats; struct u64_stats_sync syncp; }; struct pcpu_tx_sc_stats { struct macsec_tx_sc_stats stats; struct u64_stats_sync syncp; }; /** * struct macsec_rx_sc - receive secure channel * @sci: secure channel identifier for this SC * @active: channel is active * @sa: array of secure associations * @stats: per-SC stats */ struct macsec_rx_sc { struct macsec_rx_sc __rcu *next; sci_t sci; bool active; struct macsec_rx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_rx_sc_stats __percpu *stats; refcount_t refcnt; struct rcu_head rcu_head; }; /** * struct macsec_tx_sa - transmit secure association * @active: * @next_pn: packet number to use for the next packet * @lock: protects next_pn manipulations * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats */ struct macsec_tx_sa { struct macsec_key key; ssci_t ssci; spinlock_t lock; union { pn_t next_pn_halves; u64 next_pn; }; refcount_t refcnt; bool active; struct macsec_tx_sa_stats __percpu *stats; struct rcu_head rcu; }; /** * struct macsec_tx_sc - transmit secure channel * @active: * @encoding_sa: association number of the SA currently in use * @encrypt: encrypt packets on transmit, or authenticate only * @send_sci: always include the SCI in the SecTAG * @end_station: * @scb: single copy broadcast flag * @sa: array of secure associations * @stats: stats for this TXSC * @md_dst: MACsec offload metadata dst */ struct macsec_tx_sc { bool active; u8 encoding_sa; bool encrypt; bool send_sci; bool end_station; bool scb; struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_tx_sc_stats __percpu *stats; struct metadata_dst *md_dst; }; /** * struct macsec_secy - MACsec Security Entity * @netdev: netdevice for this SecY * @n_rx_sc: number of receive secure channels configured on this SecY * @sci: secure channel identifier used for tx * @key_len: length of keys used by the cipher suite * @icv_len: length of ICV used by the cipher suite * @validate_frames: validation mode * @xpn: enable XPN for this SecY * @operational: MAC_Operational flag * @protect_frames: enable protection for this SecY * @replay_protect: enable packet number checks on receive * @replay_window: size of the replay window * @tx_sc: transmit secure channel * @rx_sc: linked list of receive secure channels */ struct macsec_secy { struct net_device *netdev; unsigned int n_rx_sc; sci_t sci; u16 key_len; u16 icv_len; enum macsec_validation_type validate_frames; bool xpn; bool operational; bool protect_frames; bool replay_protect; u32 replay_window; struct macsec_tx_sc tx_sc; struct macsec_rx_sc __rcu *rx_sc; }; /** * struct macsec_context - MACsec context for hardware offloading */ struct macsec_context { union { struct net_device *netdev; struct phy_device *phydev; }; enum macsec_offload offload; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct { unsigned char assoc_num; u8 key[MACSEC_MAX_KEY_LEN]; union { struct macsec_rx_sa *rx_sa; struct macsec_tx_sa *tx_sa; }; } sa; union { struct macsec_tx_sc_stats *tx_sc_stats; struct macsec_tx_sa_stats *tx_sa_stats; struct macsec_rx_sc_stats *rx_sc_stats; struct macsec_rx_sa_stats *rx_sa_stats; struct macsec_dev_stats *dev_stats; } stats; }; /** * struct macsec_ops - MACsec offloading operations */ struct macsec_ops { /* Device wide */ int (*mdo_dev_open)(struct macsec_context *ctx); int (*mdo_dev_stop)(struct macsec_context *ctx); /* SecY */ int (*mdo_add_secy)(struct macsec_context *ctx); int (*mdo_upd_secy)(struct macsec_context *ctx); int (*mdo_del_secy)(struct macsec_context *ctx); /* Security channels */ int (*mdo_add_rxsc)(struct macsec_context *ctx); int (*mdo_upd_rxsc)(struct macsec_context *ctx); int (*mdo_del_rxsc)(struct macsec_context *ctx); /* Security associations */ int (*mdo_add_rxsa)(struct macsec_context *ctx); int (*mdo_upd_rxsa)(struct macsec_context *ctx); int (*mdo_del_rxsa)(struct macsec_context *ctx); int (*mdo_add_txsa)(struct macsec_context *ctx); int (*mdo_upd_txsa)(struct macsec_context *ctx); int (*mdo_del_txsa)(struct macsec_context *ctx); /* Statistics */ int (*mdo_get_dev_stats)(struct macsec_context *ctx); int (*mdo_get_tx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_tx_sa_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sa_stats)(struct macsec_context *ctx); }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); static inline bool macsec_send_sci(const struct macsec_secy *secy) { const struct macsec_tx_sc *tx_sc = &secy->tx_sc; return tx_sc->send_sci || (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); } struct net_device *macsec_get_real_dev(const struct net_device *dev); bool macsec_netdev_is_offloaded(struct net_device *dev); static inline void *macsec_netdev_priv(const struct net_device *dev) { #if IS_ENABLED(CONFIG_VLAN_8021Q) if (is_vlan_dev(dev)) return netdev_priv(vlan_dev_priv(dev)->real_dev); #endif return netdev_priv(dev); } #endif /* _NET_MACSEC_H_ */
582 356 806 403 211 211 211 210 211 211 2 17 166 282 418 270 270 270 1 403 254 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _IP6_FIB_H #define _IP6_FIB_H #include <linux/ipv6_route.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/notifier.h> #include <net/dst.h> #include <net/flow.h> #include <net/ip_fib.h> #include <net/netlink.h> #include <net/inetpeer.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> #include <uapi/linux/bpf.h> #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_HASHSZ 256 #else #define FIB6_TABLE_HASHSZ 1 #endif #define RT6_DEBUG 2 #if RT6_DEBUG >= 3 #define RT6_TRACE(x...) pr_debug(x) #else #define RT6_TRACE(x...) do { ; } while (0) #endif struct rt6_info; struct fib6_info; struct fib6_config { u32 fc_table; u32 fc_metric; int fc_dst_len; int fc_src_len; int fc_ifindex; u32 fc_flags; u32 fc_protocol; u16 fc_type; /* only 8 bits are used */ u16 fc_delete_all_nh : 1, fc_ignore_dev_down:1, __unused : 14; u32 fc_nh_id; struct in6_addr fc_dst; struct in6_addr fc_src; struct in6_addr fc_prefsrc; struct in6_addr fc_gateway; unsigned long fc_expires; struct nlattr *fc_mx; int fc_mx_len; int fc_mp_len; struct nlattr *fc_mp; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; bool fc_is_fdb; }; struct fib6_node { struct fib6_node __rcu *parent; struct fib6_node __rcu *left; struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; struct fib6_gc_args { int timeout; int more; }; #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL static inline bool fib6_routes_require_src(const struct net *net) { return false; } static inline void fib6_routes_require_src_inc(struct net *net) {} static inline void fib6_routes_require_src_dec(struct net *net) {} #else static inline bool fib6_routes_require_src(const struct net *net) { return net->ipv6.fib6_routes_require_src > 0; } static inline void fib6_routes_require_src_inc(struct net *net) { net->ipv6.fib6_routes_require_src++; } static inline void fib6_routes_require_src_dec(struct net *net) { net->ipv6.fib6_routes_require_src--; } #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif /* * routing information * */ struct rt6key { struct in6_addr addr; int plen; }; struct fib6_table; struct rt6_exception_bucket { struct hlist_head chain; int depth; }; struct rt6_exception { struct hlist_node hlist; struct rt6_info *rt6i; unsigned long stamp; struct rcu_head rcu; }; #define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 struct fib6_nh { struct fib_nh_common nh_common; #ifdef CONFIG_IPV6_ROUTER_PREF unsigned long last_probe; #endif struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; }; struct fib6_info { struct fib6_table *fib6_table; struct fib6_info __rcu *fib6_next; struct fib6_node __rcu *fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ union { struct list_head fib6_siblings; struct list_head nh_list; }; unsigned int fib6_nsiblings; refcount_t fib6_ref; unsigned long expires; struct hlist_node gc_link; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct rt6key fib6_dst; u32 fib6_flags; struct rt6key fib6_src; struct rt6key fib6_prefsrc; u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; u8 offload; u8 trap; u8 offload_failed; u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, unused:4; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; }; struct rt6_info { struct dst_entry dst; struct fib6_info __rcu *from; int sernum; struct rt6key rt6i_dst; struct rt6key rt6i_src; struct in6_addr rt6i_gateway; struct inet6_dev *rt6i_idev; u32 rt6i_flags; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; }; struct fib6_result { struct fib6_nh *nh; struct fib6_info *f6i; u32 fib6_flags; u8 fib6_type; struct rt6_info *rt6; }; #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ rt = rcu_dereference_protected(rt->fib6_next, 1)) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; } static inline bool fib6_requires_src(const struct fib6_info *rt) { return rt->fib6_src.plen > 0; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } static inline bool fib6_has_expires(const struct fib6_info *f6i) { return f6i->fib6_flags & RTF_EXPIRES; } /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely * Return false if not */ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ smp_rmb(); status = true; } return status; } static inline u32 rt6_get_cookie(const struct rt6_info *rt) { struct fib6_info *from; u32 cookie = 0; if (rt->sernum) return rt->sernum; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) { /* dst_release() accepts a NULL parameter. * We rely on dst being first structure in struct rt6_info */ BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0); dst_release(&rt->dst); } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh); void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { refcount_inc(&f6i->fib6_ref); } static inline bool fib6_info_hold_safe(struct fib6_info *f6i) { return refcount_inc_not_zero(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif FWS_L, FWS_R, FWS_C, FWS_U }; struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; unsigned int skip_in_node; int (*func)(struct fib6_walker *); void *args; }; struct rt6_statistics { __u32 fib_nodes; /* all fib6 nodes */ __u32 fib_route_nodes; /* intermediate nodes */ __u32 fib_rt_entries; /* rt entries in fib table */ __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ /* The following stat is not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ }; #define RTN_TL_ROOT 0x0001 #define RTN_ROOT 0x0002 /* tree root node */ #define RTN_RTINFO 0x0004 /* node with valid routing info */ /* * priority levels (or metrics) * */ struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC #define RT6_TABLE_MAIN RT_TABLE_MAIN #define RT6_TABLE_DFLT RT6_TABLE_MAIN #define RT6_TABLE_INFO RT6_TABLE_MAIN #define RT6_TABLE_PREFIX RT6_TABLE_MAIN #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_MIN 1 #define FIB6_TABLE_MAX RT_TABLE_MAX #define RT6_TABLE_LOCAL RT_TABLE_LOCAL #else #define FIB6_TABLE_MIN RT_TABLE_MAIN #define FIB6_TABLE_MAX FIB6_TABLE_MIN #define RT6_TABLE_LOCAL RT6_TABLE_MAIN #endif typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, struct flowi6 *, const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib6_info *rt; unsigned int nsiblings; }; /* * exported functions */ struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); /* called with rcu lock held; can return error pointer * caller needs to select path */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags); /* called with rcu lock held; caller needs to select path */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict); void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict); struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); static inline void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) { const struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) *addr = from->fib6_prefsrc.addr; else *addr = in6addr_any; rcu_read_unlock(); } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack); int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack); int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); void fib6_gc_cleanup(void); int fib6_init(void); /* fib6_info must be locked by the caller, and fib6_info->fib6_table can be * NULL. */ static inline void fib6_set_expires_locked(struct fib6_info *f6i, unsigned long expires) { struct fib6_table *tb6; tb6 = f6i->fib6_table; f6i->expires = expires; if (tb6 && !fib6_has_expires(f6i)) hlist_add_head(&f6i->gc_link, &tb6->tb6_gc_hlist); f6i->fib6_flags |= RTF_EXPIRES; } /* fib6_info must be locked by the caller, and fib6_info->fib6_table can be * NULL. If fib6_table is NULL, the fib6_info will no be inserted into the * list of GC candidates until it is inserted into a table. */ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_set_expires_locked(f6i, expires); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } static inline void fib6_clean_expires_locked(struct fib6_info *f6i) { if (fib6_has_expires(f6i)) hlist_del_init(&f6i->gc_link); f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } static inline void fib6_clean_expires(struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_clean_expires_locked(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; loff_t skip; struct fib6_table *tbl; int sernum; }; extern const struct seq_operations ipv6_route_seq_ops; int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i); void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed); #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct fib6_info *, rt); }; #endif INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup, struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return INDIRECT_CALL_4(lookup, ip6_pol_route_output, ip6_pol_route_input, ip6_pol_route_lookup, __ip6_route_redirect, net, table, fl6, skb, flags); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { return net->ipv6.fib6_has_custom_rules; } int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv6.fib6_rules_require_fldissect) return false; memset(flkeys, 0, sizeof(*flkeys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys, NULL, 0, 0, 0, flag); fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; fl6->flowi6_proto = flkeys->basic.ip_proto; return true; } #else static inline bool fib6_has_custom_rules(const struct net *net) { return false; } static inline int fib6_rules_init(void) { return 0; } static inline void fib6_rules_cleanup(void) { return ; } static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib6_rules_seq_read(struct net *net) { return 0; } static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { return false; } #endif #endif
1 165 170 60 165 165 165 165 165 165 1 6 6 2 2 1 1 6 6 60 60 60 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 // SPDX-License-Identifier: GPL-2.0 /* Multipath TCP token management * Copyright (c) 2017 - 2019, Intel Corporation. * * Note: This code is based on mptcp_ctrl.c from multipath-tcp.org, * authored by: * * Sébastien Barré <sebastien.barre@uclouvain.be> * Christoph Paasch <christoph.paasch@uclouvain.be> * Jaakko Korkeaniemi <jaakko.korkeaniemi@aalto.fi> * Gregory Detal <gregory.detal@uclouvain.be> * Fabien Duchêne <fabien.duchene@uclouvain.be> * Andreas Seelinger <Andreas.Seelinger@rwth-aachen.de> * Lavkesh Lahngir <lavkesh51@gmail.com> * Andreas Ripke <ripke@neclab.eu> * Vlad Dogaru <vlad.dogaru@intel.com> * Octavian Purdila <octavian.purdila@intel.com> * John Ronan <jronan@tssg.org> * Catalin Nicutar <catalin.nicutar@gmail.com> * Brandon Heller <brandonh@stanford.edu> */ #define pr_fmt(fmt) "MPTCP: " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/memblock.h> #include <linux/ip.h> #include <linux/tcp.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/protocol.h> #include <net/mptcp.h> #include "protocol.h" #define TOKEN_MAX_CHAIN_LEN 4 struct token_bucket { spinlock_t lock; int chain_len; struct hlist_nulls_head req_chain; struct hlist_nulls_head msk_chain; }; static struct token_bucket *token_hash __read_mostly; static unsigned int token_mask __read_mostly; static struct token_bucket *token_bucket(u32 token) { return &token_hash[token & token_mask]; } /* called with bucket lock held */ static struct mptcp_subflow_request_sock * __token_lookup_req(struct token_bucket *t, u32 token) { struct mptcp_subflow_request_sock *req; struct hlist_nulls_node *pos; hlist_nulls_for_each_entry_rcu(req, pos, &t->req_chain, token_node) if (req->token == token) return req; return NULL; } /* called with bucket lock held */ static struct mptcp_sock * __token_lookup_msk(struct token_bucket *t, u32 token) { struct hlist_nulls_node *pos; struct sock *sk; sk_nulls_for_each_rcu(sk, pos, &t->msk_chain) if (mptcp_sk(sk)->token == token) return mptcp_sk(sk); return NULL; } static bool __token_bucket_busy(struct token_bucket *t, u32 token) { return !token || t->chain_len >= TOKEN_MAX_CHAIN_LEN || __token_lookup_req(t, token) || __token_lookup_msk(t, token); } static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) { /* we might consider a faster version that computes the key as a * hash of some information available in the MPTCP socket. Use * random data at the moment, as it's probably the safest option * in case multiple sockets are opened in different namespaces at * the same time. */ get_random_bytes(key, sizeof(u64)); mptcp_crypto_key_sha(*key, token, idsn); } /** * mptcp_token_new_request - create new key/idsn/token for subflow_request * @req: the request socket * * This function is called when a new mptcp connection is coming in. * * It creates a unique token to identify the new mptcp connection, * a secret local key and the initial data sequence number (idsn). * * Returns 0 on success. */ int mptcp_token_new_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct token_bucket *bucket; u32 token; mptcp_crypto_key_sha(subflow_req->local_key, &subflow_req->token, &subflow_req->idsn); pr_debug("req=%p local_key=%llu, token=%u, idsn=%llu\n", req, subflow_req->local_key, subflow_req->token, subflow_req->idsn); token = subflow_req->token; bucket = token_bucket(token); spin_lock_bh(&bucket->lock); if (__token_bucket_busy(bucket, token)) { spin_unlock_bh(&bucket->lock); return -EBUSY; } hlist_nulls_add_head_rcu(&subflow_req->token_node, &bucket->req_chain); bucket->chain_len++; spin_unlock_bh(&bucket->lock); return 0; } /** * mptcp_token_new_connect - create new key/idsn/token for subflow * @ssk: the socket that will initiate a connection * * This function is called when a new outgoing mptcp connection is * initiated. * * It creates a unique token to identify the new mptcp connection, * a secret local key and the initial data sequence number (idsn). * * On success, the mptcp connection can be found again using * the computed token at a later time, this is needed to process * join requests. * * returns 0 on success. */ int mptcp_token_new_connect(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct mptcp_sock *msk = mptcp_sk(subflow->conn); int retries = MPTCP_TOKEN_MAX_RETRIES; struct sock *sk = subflow->conn; struct token_bucket *bucket; again: mptcp_crypto_key_gen_sha(&subflow->local_key, &subflow->token, &subflow->idsn); bucket = token_bucket(subflow->token); spin_lock_bh(&bucket->lock); if (__token_bucket_busy(bucket, subflow->token)) { spin_unlock_bh(&bucket->lock); if (!--retries) return -EBUSY; goto again; } pr_debug("ssk=%p, local_key=%llu, token=%u, idsn=%llu\n", ssk, subflow->local_key, subflow->token, subflow->idsn); WRITE_ONCE(msk->token, subflow->token); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); bucket->chain_len++; spin_unlock_bh(&bucket->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); return 0; } /** * mptcp_token_accept - replace a req sk with full sock in token hash * @req: the request socket to be removed * @msk: the just cloned socket linked to the new connection * * Called when a SYN packet creates a new logical connection, i.e. * is not a join request. */ void mptcp_token_accept(struct mptcp_subflow_request_sock *req, struct mptcp_sock *msk) { struct mptcp_subflow_request_sock *pos; struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); bucket = token_bucket(req->token); spin_lock_bh(&bucket->lock); /* pedantic lookup check for the moved token */ pos = __token_lookup_req(bucket, req->token); if (!WARN_ON_ONCE(pos != req)) hlist_nulls_del_init_rcu(&req->token_node); __sk_nulls_add_node_rcu((struct sock *)msk, &bucket->msk_chain); spin_unlock_bh(&bucket->lock); } bool mptcp_token_exists(u32 token) { struct hlist_nulls_node *pos; struct token_bucket *bucket; struct mptcp_sock *msk; struct sock *sk; rcu_read_lock(); bucket = token_bucket(token); again: sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { msk = mptcp_sk(sk); if (READ_ONCE(msk->token) == token) goto found; } if (get_nulls_value(pos) != (token & token_mask)) goto again; rcu_read_unlock(); return false; found: rcu_read_unlock(); return true; } /** * mptcp_token_get_sock - retrieve mptcp connection sock using its token * @net: restrict to this namespace * @token: token of the mptcp connection to retrieve * * This function returns the mptcp connection structure with the given token. * A reference count on the mptcp socket returned is taken. * * returns NULL if no connection with the given token value exists. */ struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { struct hlist_nulls_node *pos; struct token_bucket *bucket; struct mptcp_sock *msk; struct sock *sk; rcu_read_lock(); bucket = token_bucket(token); again: sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { msk = mptcp_sk(sk); if (READ_ONCE(msk->token) != token || !net_eq(sock_net(sk), net)) continue; if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto not_found; if (READ_ONCE(msk->token) != token || !net_eq(sock_net(sk), net)) { sock_put(sk); goto again; } goto found; } if (get_nulls_value(pos) != (token & token_mask)) goto again; not_found: msk = NULL; found: rcu_read_unlock(); return msk; } EXPORT_SYMBOL_GPL(mptcp_token_get_sock); /** * mptcp_token_iter_next - iterate over the token container from given pos * @net: namespace to be iterated * @s_slot: start slot number * @s_num: start number inside the given lock * * This function returns the first mptcp connection structure found inside the * token container starting from the specified position, or NULL. * * On successful iteration, the iterator is moved to the next position and * a reference to the returned socket is acquired. */ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num) { struct mptcp_sock *ret = NULL; struct hlist_nulls_node *pos; int slot, num = 0; for (slot = *s_slot; slot <= token_mask; *s_num = 0, slot++) { struct token_bucket *bucket = &token_hash[slot]; struct sock *sk; num = 0; if (hlist_nulls_empty(&bucket->msk_chain)) continue; rcu_read_lock(); sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { ++num; if (!net_eq(sock_net(sk), net)) continue; if (num <= *s_num) continue; if (!refcount_inc_not_zero(&sk->sk_refcnt)) continue; if (!net_eq(sock_net(sk), net)) { sock_put(sk); continue; } ret = mptcp_sk(sk); rcu_read_unlock(); goto out; } rcu_read_unlock(); } out: *s_slot = slot; *s_num = num; return ret; } EXPORT_SYMBOL_GPL(mptcp_token_iter_next); /** * mptcp_token_destroy_request - remove mptcp connection/token * @req: mptcp request socket dropping the token * * Remove the token associated to @req. */ void mptcp_token_destroy_request(struct request_sock *req) { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_subflow_request_sock *pos; struct token_bucket *bucket; if (hlist_nulls_unhashed(&subflow_req->token_node)) return; bucket = token_bucket(subflow_req->token); spin_lock_bh(&bucket->lock); pos = __token_lookup_req(bucket, subflow_req->token); if (!WARN_ON_ONCE(pos != subflow_req)) { hlist_nulls_del_init_rcu(&pos->token_node); bucket->chain_len--; } spin_unlock_bh(&bucket->lock); } /** * mptcp_token_destroy - remove mptcp connection/token * @msk: mptcp connection dropping the token * * Remove the token associated to @msk */ void mptcp_token_destroy(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; struct token_bucket *bucket; struct mptcp_sock *pos; if (sk_unhashed((struct sock *)msk)) return; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); bucket = token_bucket(msk->token); spin_lock_bh(&bucket->lock); pos = __token_lookup_msk(bucket, msk->token); if (!WARN_ON_ONCE(pos != msk)) { __sk_nulls_del_node_init_rcu((struct sock *)pos); bucket->chain_len--; } spin_unlock_bh(&bucket->lock); WRITE_ONCE(msk->token, 0); } void __init mptcp_token_init(void) { int i; token_hash = alloc_large_system_hash("MPTCP token", sizeof(struct token_bucket), 0, 20,/* one slot per 1MB of memory */ HASH_ZERO, NULL, &token_mask, 0, 64 * 1024); for (i = 0; i < token_mask + 1; ++i) { INIT_HLIST_NULLS_HEAD(&token_hash[i].req_chain, i); INIT_HLIST_NULLS_HEAD(&token_hash[i].msk_chain, i); spin_lock_init(&token_hash[i].lock); } } #if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) EXPORT_SYMBOL_GPL(mptcp_token_new_request); EXPORT_SYMBOL_GPL(mptcp_token_new_connect); EXPORT_SYMBOL_GPL(mptcp_token_accept); EXPORT_SYMBOL_GPL(mptcp_token_destroy_request); EXPORT_SYMBOL_GPL(mptcp_token_destroy); #endif
13098 1271 2200 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 /* SPDX-License-Identifier: GPL-2.0-only */ /* * AppArmor security module * * This file contains AppArmor label definitions * * Copyright 2017 Canonical Ltd. */ #ifndef __AA_LABEL_H #define __AA_LABEL_H #include <linux/atomic.h> #include <linux/audit.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> #include "apparmor.h" #include "lib.h" struct aa_ns; #define LOCAL_VEC_ENTRIES 8 #define DEFINE_VEC(T, V) \ struct aa_ ## T *(_ ## V ## _localtmp)[LOCAL_VEC_ENTRIES]; \ struct aa_ ## T **(V) #define vec_setup(T, V, N, GFP) \ ({ \ if ((N) <= LOCAL_VEC_ENTRIES) { \ typeof(N) i; \ (V) = (_ ## V ## _localtmp); \ for (i = 0; i < (N); i++) \ (V)[i] = NULL; \ } else \ (V) = kzalloc(sizeof(struct aa_ ## T *) * (N), (GFP)); \ (V) ? 0 : -ENOMEM; \ }) #define vec_cleanup(T, V, N) \ do { \ int i; \ for (i = 0; i < (N); i++) { \ if (!IS_ERR_OR_NULL((V)[i])) \ aa_put_ ## T((V)[i]); \ } \ if ((V) != _ ## V ## _localtmp) \ kfree(V); \ } while (0) #define vec_last(VEC, SIZE) ((VEC)[(SIZE) - 1]) #define vec_ns(VEC, SIZE) (vec_last((VEC), (SIZE))->ns) #define vec_labelset(VEC, SIZE) (&vec_ns((VEC), (SIZE))->labels) #define cleanup_domain_vec(V, L) cleanup_label_vec((V), (L)->size) struct aa_profile; #define VEC_FLAG_TERMINATE 1 int aa_vec_unique(struct aa_profile **vec, int n, int flags); struct aa_label *aa_vec_find_or_create_label(struct aa_profile **vec, int len, gfp_t gfp); #define aa_sort_and_merge_vec(N, V) \ aa_sort_and_merge_profiles((N), (struct aa_profile **)(V)) /* struct aa_labelset - set of labels for a namespace * * Labels are reference counted; aa_labelset does not contribute to label * reference counts. Once a label's last refcount is put it is removed from * the set. */ struct aa_labelset { rwlock_t lock; struct rb_root root; }; #define __labelset_for_each(LS, N) \ for ((N) = rb_first(&(LS)->root); (N); (N) = rb_next(N)) enum label_flags { FLAG_HAT = 1, /* profile is a hat */ FLAG_UNCONFINED = 2, /* label unconfined only if all */ FLAG_NULL = 4, /* profile is null learning profile */ FLAG_IX_ON_NAME_ERROR = 8, /* fallback to ix on name lookup fail */ FLAG_IMMUTIBLE = 0x10, /* don't allow changes/replacement */ FLAG_USER_DEFINED = 0x20, /* user based profile - lower privs */ FLAG_NO_LIST_REF = 0x40, /* list doesn't keep profile ref */ FLAG_NS_COUNT = 0x80, /* carries NS ref count */ FLAG_IN_TREE = 0x100, /* label is in tree */ FLAG_PROFILE = 0x200, /* label is a profile */ FLAG_EXPLICIT = 0x400, /* explicit static label */ FLAG_STALE = 0x800, /* replaced/removed */ FLAG_RENAMED = 0x1000, /* label has renaming in it */ FLAG_REVOKED = 0x2000, /* label has revocation in it */ FLAG_DEBUG1 = 0x4000, FLAG_DEBUG2 = 0x8000, /* These flags must correspond with PATH_flags */ /* TODO: add new path flags */ }; struct aa_label; struct aa_proxy { struct kref count; struct aa_label __rcu *label; }; struct label_it { int i, j; }; /* struct aa_label - lazy labeling struct * @count: ref count of active users * @node: rbtree position * @rcu: rcu callback struct * @proxy: is set to the label that replaced this label * @hname: text representation of the label (MAYBE_NULL) * @flags: stale and other flags - values may change under label set lock * @secid: secid that references this label * @size: number of entries in @ent[] * @ent: set of profiles for label, actual size determined by @size */ struct aa_label { struct kref count; struct rb_node node; struct rcu_head rcu; struct aa_proxy *proxy; __counted char *hname; long flags; u32 secid; int size; struct aa_profile *vec[]; }; #define last_error(E, FN) \ do { \ int __subE = (FN); \ if (__subE) \ (E) = __subE; \ } while (0) #define label_isprofile(X) ((X)->flags & FLAG_PROFILE) #define label_unconfined(X) ((X)->flags & FLAG_UNCONFINED) #define unconfined(X) label_unconfined(X) #define label_is_stale(X) ((X)->flags & FLAG_STALE) #define __label_make_stale(X) ((X)->flags |= FLAG_STALE) #define labels_ns(X) (vec_ns(&((X)->vec[0]), (X)->size)) #define labels_set(X) (&labels_ns(X)->labels) #define labels_view(X) labels_ns(X) #define labels_profile(X) ((X)->vec[(X)->size - 1]) int aa_label_next_confined(struct aa_label *l, int i); /* for each profile in a label */ #define label_for_each(I, L, P) \ for ((I).i = 0; ((P) = (L)->vec[(I).i]); ++((I).i)) /* assumes break/goto ended label_for_each */ #define label_for_each_cont(I, L, P) \ for (++((I).i); ((P) = (L)->vec[(I).i]); ++((I).i)) #define next_comb(I, L1, L2) \ do { \ (I).j++; \ if ((I).j >= (L2)->size) { \ (I).i++; \ (I).j = 0; \ } \ } while (0) /* for each combination of P1 in L1, and P2 in L2 */ #define label_for_each_comb(I, L1, L2, P1, P2) \ for ((I).i = (I).j = 0; \ ((P1) = (L1)->vec[(I).i]) && ((P2) = (L2)->vec[(I).j]); \ (I) = next_comb(I, L1, L2)) #define fn_for_each_comb(L1, L2, P1, P2, FN) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each_comb(i, (L1), (L2), (P1), (P2)) { \ last_error(__E, (FN)); \ } \ __E; \ }) /* for each profile that is enforcing confinement in a label */ #define label_for_each_confined(I, L, P) \ for ((I).i = aa_label_next_confined((L), 0); \ ((P) = (L)->vec[(I).i]); \ (I).i = aa_label_next_confined((L), (I).i + 1)) #define label_for_each_in_merge(I, A, B, P) \ for ((I).i = (I).j = 0; \ ((P) = aa_label_next_in_merge(&(I), (A), (B))); \ ) #define label_for_each_not_in_set(I, SET, SUB, P) \ for ((I).i = (I).j = 0; \ ((P) = __aa_label_next_not_in_set(&(I), (SET), (SUB))); \ ) #define next_in_ns(i, NS, L) \ ({ \ typeof(i) ___i = (i); \ while ((L)->vec[___i] && (L)->vec[___i]->ns != (NS)) \ (___i)++; \ (___i); \ }) #define label_for_each_in_ns(I, NS, L, P) \ for ((I).i = next_in_ns(0, (NS), (L)); \ ((P) = (L)->vec[(I).i]); \ (I).i = next_in_ns((I).i + 1, (NS), (L))) #define fn_for_each_in_ns(L, P, FN) \ ({ \ struct label_it __i; \ struct aa_ns *__ns = labels_ns(L); \ int __E = 0; \ label_for_each_in_ns(__i, __ns, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_XXX(L, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each(L, P, FN) fn_for_each_XXX(L, P, FN) #define fn_for_each_confined(L, P, FN) fn_for_each_XXX(L, P, FN, _confined) #define fn_for_each2_XXX(L1, L2, P, FN, ...) \ ({ \ struct label_it i; \ int __E = 0; \ label_for_each ## __VA_ARGS__(i, (L1), (L2), (P)) { \ last_error(__E, (FN)); \ } \ __E; \ }) #define fn_for_each_in_merge(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _in_merge) #define fn_for_each_not_in_set(L1, L2, P, FN) \ fn_for_each2_XXX((L1), (L2), P, FN, _not_in_set) #define LABEL_MEDIATES(L, C) \ ({ \ struct aa_profile *profile; \ struct label_it i; \ int ret = 0; \ label_for_each(i, (L), profile) { \ if (RULE_MEDIATES(&profile->rules, (C))) { \ ret = 1; \ break; \ } \ } \ ret; \ }) void aa_labelset_destroy(struct aa_labelset *ls); void aa_labelset_init(struct aa_labelset *ls); void __aa_labelset_update_subtree(struct aa_ns *ns); void aa_label_destroy(struct aa_label *label); void aa_label_free(struct aa_label *label); void aa_label_kref(struct kref *kref); bool aa_label_init(struct aa_label *label, int size, gfp_t gfp); struct aa_label *aa_label_alloc(int size, struct aa_proxy *proxy, gfp_t gfp); bool aa_label_is_subset(struct aa_label *set, struct aa_label *sub); bool aa_label_is_unconfined_subset(struct aa_label *set, struct aa_label *sub); struct aa_profile *__aa_label_next_not_in_set(struct label_it *I, struct aa_label *set, struct aa_label *sub); bool aa_label_remove(struct aa_label *label); struct aa_label *aa_label_insert(struct aa_labelset *ls, struct aa_label *l); bool aa_label_replace(struct aa_label *old, struct aa_label *new); bool aa_label_make_newest(struct aa_labelset *ls, struct aa_label *old, struct aa_label *new); struct aa_label *aa_label_find(struct aa_label *l); struct aa_profile *aa_label_next_in_merge(struct label_it *I, struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_find_merge(struct aa_label *a, struct aa_label *b); struct aa_label *aa_label_merge(struct aa_label *a, struct aa_label *b, gfp_t gfp); bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp); #define FLAGS_NONE 0 #define FLAG_SHOW_MODE 1 #define FLAG_VIEW_SUBNS 2 #define FLAG_HIDDEN_UNCONFINED 4 #define FLAG_ABS_ROOT 8 int aa_label_snxprint(char *str, size_t size, struct aa_ns *view, struct aa_label *label, int flags); int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); int aa_label_acntsxprint(char __counted **strp, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xaudit(struct audit_buffer *ab, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_seq_xprint(struct seq_file *f, struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_xprintk(struct aa_ns *ns, struct aa_label *label, int flags, gfp_t gfp); void aa_label_audit(struct audit_buffer *ab, struct aa_label *label, gfp_t gfp); void aa_label_seq_print(struct seq_file *f, struct aa_label *label, gfp_t gfp); void aa_label_printk(struct aa_label *label, gfp_t gfp); struct aa_label *aa_label_strn_parse(struct aa_label *base, const char *str, size_t n, gfp_t gfp, bool create, bool force_stack); struct aa_label *aa_label_parse(struct aa_label *base, const char *str, gfp_t gfp, bool create, bool force_stack); static inline const char *aa_label_strn_split(const char *str, int n) { const char *pos; aa_state_t state; state = aa_dfa_matchn_until(stacksplitdfa, DFA_START, str, n, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } static inline const char *aa_label_str_split(const char *str) { const char *pos; aa_state_t state; state = aa_dfa_match_until(stacksplitdfa, DFA_START, str, &pos); if (!ACCEPT_TABLE(stacksplitdfa)[state]) return NULL; return pos - 3; } struct aa_perms; struct aa_ruleset; int aa_label_match(struct aa_profile *profile, struct aa_ruleset *rules, struct aa_label *label, aa_state_t state, bool subns, u32 request, struct aa_perms *perms); /** * __aa_get_label - get a reference count to uncounted label reference * @l: reference to get a count on * * Returns: pointer to reference OR NULL if race is lost and reference is * being repeated. * Requires: lock held, and the return code MUST be checked */ static inline struct aa_label *__aa_get_label(struct aa_label *l) { if (l && kref_get_unless_zero(&l->count)) return l; return NULL; } static inline struct aa_label *aa_get_label(struct aa_label *l) { if (l) kref_get(&(l->count)); return l; } /** * aa_get_label_rcu - increment refcount on a label that can be replaced * @l: pointer to label that can be replaced (NOT NULL) * * Returns: pointer to a refcounted label. * else NULL if no label */ static inline struct aa_label *aa_get_label_rcu(struct aa_label __rcu **l) { struct aa_label *c; rcu_read_lock(); do { c = rcu_dereference(*l); } while (c && !kref_get_unless_zero(&c->count)); rcu_read_unlock(); return c; } /** * aa_get_newest_label - find the newest version of @l * @l: the label to check for newer versions of * * Returns: refcounted newest version of @l taking into account * replacement, renames and removals * return @l. */ static inline struct aa_label *aa_get_newest_label(struct aa_label *l) { if (!l) return NULL; if (label_is_stale(l)) { struct aa_label *tmp; AA_BUG(!l->proxy); AA_BUG(!l->proxy->label); /* BUG: only way this can happen is @l ref count and its * replacement count have gone to 0 and are on their way * to destruction. ie. we have a refcounting error */ tmp = aa_get_label_rcu(&l->proxy->label); AA_BUG(!tmp); return tmp; } return aa_get_label(l); } static inline void aa_put_label(struct aa_label *l) { if (l) kref_put(&l->count, aa_label_kref); } struct aa_proxy *aa_alloc_proxy(struct aa_label *l, gfp_t gfp); void aa_proxy_kref(struct kref *kref); static inline struct aa_proxy *aa_get_proxy(struct aa_proxy *proxy) { if (proxy) kref_get(&(proxy->count)); return proxy; } static inline void aa_put_proxy(struct aa_proxy *proxy) { if (proxy) kref_put(&proxy->count, aa_proxy_kref); } void __aa_proxy_redirect(struct aa_label *orig, struct aa_label *new); #endif /* __AA_LABEL_H */
181 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // SPDX-License-Identifier: GPL-2.0-or-later /* * Network event notifiers * * Authors: * Tom Tucker <tom@opengridcomputing.com> * Steve Wise <swise@opengridcomputing.com> * * Fixes: */ #include <linux/rtnetlink.h> #include <linux/notifier.h> #include <linux/export.h> #include <net/netevent.h> static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); /** * register_netevent_notifier - register a netevent notifier block * @nb: notifier * * Register a notifier to be called when a netevent occurs. * The notifier passed is linked into the kernel structures and must * not be reused until it has been unregistered. A negative errno code * is returned on a failure. */ int register_netevent_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&netevent_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_netevent_notifier); /** * unregister_netevent_notifier - unregister a netevent notifier block * @nb: notifier * * Unregister a notifier previously registered by * register_neigh_notifier(). The notifier is unlinked into the * kernel structures and may then be reused. A negative errno code * is returned on a failure. */ int unregister_netevent_notifier(struct notifier_block *nb) { return atomic_notifier_chain_unregister(&netevent_notif_chain, nb); } EXPORT_SYMBOL_GPL(unregister_netevent_notifier); /** * call_netevent_notifiers - call all netevent notifier blocks * @val: value passed unmodified to notifier function * @v: pointer passed unmodified to notifier function * * Call all neighbour notifier blocks. Parameters and return value * are as for notifier_call_chain(). */ int call_netevent_notifiers(unsigned long val, void *v) { return atomic_notifier_call_chain(&netevent_notif_chain, val, v); } EXPORT_SYMBOL_GPL(call_netevent_notifiers);
16198 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 // SPDX-License-Identifier: GPL-2.0-only /* * xsave/xrstor support. * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ #include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> #include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms * to save/restore PT state in Linux. */ static const char *xfeature_names[] = { "x87 floating point registers", "SSE registers", "AVX registers", "MPX bounds registers", "MPX CSR", "AVX-512 opmask", "AVX-512 Hi256", "AVX-512 ZMM_Hi256", "Processor Trace (unused)", "Protection Keys User registers", "PASID state", "Control-flow User registers", "Control-flow Kernel registers (unused)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "AMX Tile config", "AMX Tile data", "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_FP] = X86_FEATURE_FPU, [XFEATURE_SSE] = X86_FEATURE_XMM, [XFEATURE_YMM] = X86_FEATURE_AVX, [XFEATURE_BNDREGS] = X86_FEATURE_MPX, [XFEATURE_BNDCSR] = X86_FEATURE_MPX, [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; u64 xfeatures_print; /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. So if a driver * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ if (xfeatures_missing) xfeatures_print = xfeatures_missing; else xfeatures_print = xfeatures_needed; xfeature_idx = fls64(xfeatures_print)-1; max_idx = ARRAY_SIZE(xfeature_names)-1; xfeature_idx = min(xfeature_idx, max_idx); *feature_name = xfeature_names[xfeature_idx]; } if (xfeatures_missing) return 0; return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); static bool xfeature_is_aligned64(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } static bool xfeature_is_supervisor(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { unsigned int offs, i; /* * Non-compacted format and legacy features use the cached fixed * offsets. */ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* * Compacted format offsets depend on the actual content of the * compacted xsave area which is determined by the xcomp_bv header * field. */ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; for_each_extended_xfeature(i, xcomp_bv) { if (xfeature_is_aligned64(i)) offs = ALIGN(offs, 64); if (i == xfeature) break; offs += xstate_sizes[i]; } return offs; } /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. */ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures * that any stale state is wiped out from XFD. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) wrmsrl(MSR_IA32_XFD, init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } static bool xfeature_enabled(enum xfeature xfeature) { return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. */ static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, i; /* start at the beginning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form * or standard form. */ xstate_offsets[XFEATURE_FP] = 0; xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space); xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); xstate_sizes[i] = eax; xstate_flags[i] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ if (xfeature_is_supervisor(i)) continue; xstate_offsets[i] = ebx; /* * In our xstate size checks, we assume that the highest-numbered * xstate feature has the highest offset in the buffer. Ensure * it does. */ WARN_ONCE(last_good_offset > xstate_offsets[i], "x86/fpu: misordered xstate at %d\n", last_good_offset); last_good_offset = xstate_offsets[i]; } } static void __init print_xstate_feature(u64 xstate_mask) { const char *feature_name; if (cpu_has_xfeatures(xstate_mask, &feature_name)) pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name); } /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { print_xstate_feature(XFEATURE_MASK_FP); print_xstate_feature(XFEATURE_MASK_SSE); print_xstate_feature(XFEATURE_MASK_YMM); print_xstate_feature(XFEATURE_MASK_BNDREGS); print_xstate_feature(XFEATURE_MASK_BNDCSR); print_xstate_feature(XFEATURE_MASK_OPMASK); print_xstate_feature(XFEATURE_MASK_ZMM_Hi256); print_xstate_feature(XFEATURE_MASK_Hi16_ZMM); print_xstate_feature(XFEATURE_MASK_PKRU); print_xstate_feature(XFEATURE_MASK_PASID); print_xstate_feature(XFEATURE_MASK_CET_USER); print_xstate_feature(XFEATURE_MASK_XTILE_CFG); print_xstate_feature(XFEATURE_MASK_XTILE_DATA); } /* * This check is important because it is easy to get XSTATE_* * confused with XSTATE_BIT_*. */ #define CHECK_XFEATURE(nr) do { \ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ WARN_ON(nr >= XFEATURE_MAX); \ } while (0) /* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) { int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), i, xstate_sizes[i]); } } /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static __init void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch * newly added supported features at build time and make people * actually look at the init state for the new feature. */ #define XFEATURES_INIT_FPSTATE_HANDLED \ (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_XTILE) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; print_xstate_features(); xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 */ os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the * legacy area. Adding new features requires to ensure that init * state is all zeroes or if not to add the necessary handling * here. */ fxsave(&init_fpstate.regs.fxsave); } int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr, struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ if (hdr->xcomp_bv) return -EINVAL; /* * If 'reserved' is shrunken to add a new field, make sure to validate * that new field here! */ BUILD_BUG_ON(sizeof(hdr->reserved) != 48); /* No reserved bits may be set */ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) return -EINVAL; return 0; } static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; static int should_dump = 1; if (!should_dump) return; should_dump = 0; /* * Dump out a few leaves past the ones that we support * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", XSTATE_CPUID, i, eax, ebx, ecx, edx); } } #define XSTATE_WARN_ON(x, fmt, ...) do { \ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) #define XCHECK_SZ(sz, nr, __struct) ({ \ if (WARN_ONCE(sz != sizeof(__struct), \ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ true; \ }) /** * check_xtile_data_against_struct - Check tile data state size. * * Calculate the state size by multiplying the single tile size which is * recorded in a C struct, and the number of tiles that the CPU informs. * Compare the provided size with the calculation. * * @size: The tile data state size * * Returns: 0 on success, -EINVAL on mismatch. */ static int __init check_xtile_data_against_struct(int size) { u32 max_palid, palid, state_size; u32 eax, ebx, ecx, edx; u16 max_tile; /* * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of * supported tiles. */ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { u16 tile_size, max; /* * Check the tile size info: * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; if (tile_size != sizeof(struct xtile_data)) { pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", __stringify(XFEATURE_XTILE_DATA), sizeof(struct xtile_data), tile_size); __xstate_dump_leaves(); return -EINVAL; } if (max > max_tile) max_tile = max; } state_size = sizeof(struct xtile_data) * max_tile; if (size != state_size) { pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", __stringify(XFEATURE_XTILE_DATA), state_size, size); __xstate_dump_leaves(); return -EINVAL; } return 0; } /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); /* * Match each CPU state with the corresponding software * structure. */ switch (nr) { case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; } static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; unsigned int offset = xstate_offsets[topmost]; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); if (compacted) offset = xfeature_get_offset(xfeatures, topmost); return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* * Supervisor state components can be managed only by * XSAVES. */ if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size, "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. * * This also takes compaction into account. So this works for * XSAVEC as well. */ static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 1: * EBX enumerates the size (in bytes) required by * the XSAVES instruction for an XSAVE area * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. * * When XSAVES is not available but XSAVEC is (virt), then there * are no supervisor states, but XSAVEC still uses compacted * format. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); return ebx; } /* * Get the total size of the enabled xstates without the independent supervisor * features. */ static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_compacted_size(); /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by * the XSAVE instruction for an XSAVE area * containing all the *user* state components * corresponding to bits currently set in XCR0. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); return ebx; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and uses compacted * format. XSAVEC uses compacted format, but does not save * supervisor states. * * XSAVE[OPT] do not support supervisor states so kernel and user * size is identical. */ if (compacted) kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; fpu_kernel_cfg.max_size = kernel_size; fpu_user_cfg.max_size = user_size; fpu_kernel_cfg.default_size = kernel_default_size; fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); return 0; } /* * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); /* Restore the legacy size.*/ fpu_kernel_cfg.max_size = legacy_size; fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; /* * Prevent enabling the static branch which enables writes to the * XFD MSR. */ init_fpstate.xfd = 0; fpstate_reset(&current->thread.fpu); } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; int err; int i; if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; } if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: x87 FPU will use %s\n", boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { WARN_ON_FPU(1); return; } /* * Find user xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } /* * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { unsigned short cid = xsave_cpuid_features[i]; /* Careful: X86_FEATURE_FPU is 0! */ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; else fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; /* Clean out dynamic features from default */ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; fpu_user_cfg.default_features = fpu_user_cfg.max_features; fpu_user_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; /* * Initialize the default XFD state in initfp_state and enable the * dynamic sizing mechanism if dynamic states are available. The * static key cannot be enabled here because this runs before * jump_label_init(). This is delayed to an initcall. */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Set up compaction feature bit */ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || cpu_feature_enabled(X86_FEATURE_XSAVES)) setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); /* Cache size, offset and flags for initialization */ setup_xstate_cache(); err = init_xstate_size(); if (err) goto out_disable; /* Reset the state for the current task */ fpstate_reset(&current->thread.fpu); /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); /* * init_fpstate excludes dynamic states as they are large but init * state is zero. */ init_fpstate.size = fpu_kernel_cfg.default_size; init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } setup_init_fpu_buf(); /* * Paranoia check whether something in the setup modified the * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } /* * CPU capabilities initialization runs before FPU init. So * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely * functional, set the feature bit so depending code works. */ setup_force_cpu_cap(X86_FEATURE_OSXSAVE); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ fpu__init_disable_system_xstate(legacy_size); } /* * Restore minimal FPU state after suspend: */ void fpu__resume_cpu(void) { /* * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); } /* * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { u64 xcomp_bv = xsave->header.xcomp_bv; if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } /* * Given the xsave area and a state inside, this function returns the * address of the state. * * This is the API that is called to get xstate address in either * standard format or compacted format of xsave area. * * Note that if there is no data for the field in the xsave buffer * this will return NULL. * * Inputs: * xstate: the thread's storage area for all FPU data * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? */ if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; /* * We should not ever be requesting features that we * have not enabled. */ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * * This can happen because the last 'xsave' did not * request that this feature be saved (unlikely) * or because the "init optimization" caused it * to not be saved. */ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; return __raw_xsave_addr(xsave, xfeature_nr); } #ifdef CONFIG_ARCH_HAS_PKEYS /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { u32 old_pkru, new_pkru_bits = 0; int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* * This code should only be called with valid 'pkey' * values originating from in-kernel users. Complain * if a bad value is observed. */ if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ old_pkru = read_pkru(); old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); /* Write old part along with new part: */ write_pkru(old_pkru | new_pkru_bits); return 0; } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { membuf_write(to, from_xstate ? xstate : init_xstate, size); } /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; struct xstate_header header; unsigned int zerofrom; u64 mask; int i; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ switch (copy_mode) { case XSTATE_COPY_FP: header.xfeatures &= XFEATURE_MASK_FP; break; case XSTATE_COPY_FX: header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; break; case XSTATE_COPY_XSAVE: header.xfeatures &= fpstate->user_xfeatures; break; } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, &xinit->i387, off_mxcsr); /* Copy MXCSR when SSE or YMM are set in the feature mask */ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, MXCSR_AND_FLAGS_SIZE); /* Copy the remaining FP state */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387.st_space, &xinit->i387.st_space, sizeof(xsave->i387.st_space)); /* Copy the SSE state - shared with YMM, but independently managed */ copy_feature(header.xfeatures & XFEATURE_MASK_SSE, &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); if (copy_mode != XSTATE_COPY_XSAVE) goto out; /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); /* Copy xsave->i387.sw_reserved */ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); /* Copy the user space relevant state of @xsave->header */ membuf_write(&to, &header, sizeof(header)); zerofrom = offsetof(struct xregs_state, extended_state_area); /* * This 'mask' indicates which states to copy from fpstate. * Those extended states that are not present in fpstate are * either disabled or initialized: * * In non-compacted format, disabled features still occupy * state space but there is no state to copy from in the * compacted init_fpstate. The gap tracking will zero these * states. * * The extended features have an all zeroes init state. Thus, * remove them from 'mask' to zero those features in the user * buffer instead of retrieving them from init_fpstate. */ mask = header.xfeatures; for_each_extended_xfeature(i, mask) { /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ if (zerofrom < xstate_offsets[i]) membuf_zero(&to, xstate_offsets[i] - zerofrom); if (i == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the * XSAVE buffer. Use the provided value. */ pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, __raw_xsave_addr(xsave, i), xstate_sizes[i]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ zerofrom = xstate_offsets[i] + xstate_sizes[i]; } out: if (to.left) membuf_zero(&to, to.left); } /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, tsk->thread.pkru, copy_mode); } static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { if (kbuf) { memcpy(dst, kbuf + offset, size); } else { if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } return 0; } /** * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate * @fpstate: The fpstate buffer to copy to * @kbuf: The UABI format buffer, if it comes from the kernel * @ubuf: The UABI format buffer, if it comes from userspace * @pkru: The location to write the PKRU value to * * Converts from the UABI format into the kernel internal hardware * dependent format. * * This function ultimately has three different callers with distinct PKRU * behavior. * 1. When called from sigreturn the PKRU register will be restored from * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to * @fpstate is sufficient to cover this case, but the caller will also * pass a pointer to the thread_struct's pkru field in @pkru and updating * it is harmless. * 2. When called from ptrace the PKRU register will be restored from the * thread_struct's pkru field. A pointer to that is passed in @pkru. * The kernel will restore it manually, so the XRSTOR behavior that resets * the PKRU register to the hardware init value (0) if the corresponding * xfeatures bit is not set is emulated here. * 3. When called from KVM the PKRU register will be restored from the vcpu's * pkru field. A pointer to that is passed in @pkru. KVM hasn't used * XRSTOR and hasn't had the PKRU resetting behavior described above. To * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures * bit is not set. */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; int i; offset = offsetof(struct xregs_state, header); if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; if (hdr.xfeatures & mask) { u32 mxcsr[2]; offset = offsetof(struct fxregs_state, mxcsr); if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) return -EFAULT; /* Reserved bits in MXCSR must be zero. */ if (mxcsr[0] & ~mxcsr_feature_mask) return -EINVAL; /* SSE and YMM require MXCSR even when FP is not in use. */ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { xsave->i387.mxcsr = mxcsr[0]; xsave->i387.mxcsr_mask = mxcsr[1]; } } for (i = 0; i < XFEATURE_MAX; i++) { mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) return -EFAULT; } } if (hdr.xfeatures & XFEATURE_MASK_PKRU) { struct pkru_state *xpkru; xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); *pkru = xpkru->pkru; } else { /* * KVM may pass NULL here to indicate that it does not need * PKRU updated. */ if (pkru) *pkru = 0; } /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= hdr.xfeatures; return 0; } /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; return true; } /** * xsaves - Save selected components to a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to save * * The @xstate buffer must be 64 byte aligned and correctly initialized as * XSAVES does not write the full xstate header. Before first use the * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } /** * xrstors - Restore selected components from a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to restore * * The @xstate buffer must be 64 byte aligned and correctly initialized * otherwise XRSTORS from that buffer can #GP. * * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * * The feature mask must be a subset of the independent features. */ void xrstors(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) { void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); } EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU /* * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask * can safely operate on the @fpstate buffer. */ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { u64 xfd = __this_cpu_read(xfd_state); if (fpstate->xfd == xfd) return true; /* * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ if (fpstate->xfd == current->thread.fpu.fpstate->xfd) return false; /* * XRSTOR(S) from init_fpstate are always correct as it will just * bring all components into init state and not read from the * buffer. XSAVE(S) raises #PF after init. */ if (fpstate == &init_fpstate) return rstor; /* * XSAVE(S): clone(), fpu_swap_kvm_fpu() * XRSTORS(S): fpu_swap_kvm_fpu() */ /* * No XSAVE/XRSTOR instructions (except XSAVE itself) touch * the buffer area for XFD-disabled state components. */ mask &= ~xfd; /* * Remove features which are valid in fpstate. They * have space allocated in fpstate. */ mask &= ~fpstate->xfeatures; /* * Any remaining state components in 'mask' might be written * by XSAVE/XRSTOR. Fail validation it found. */ return !mask; } void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); } #endif /* CONFIG_X86_DEBUG_FPU */ static int __init xfd_update_static_branch(void) { /* * If init_fpstate.xfd has bits set then dynamic features are * available and the dynamic sizing must be enabled. */ if (init_fpstate.xfd) static_branch_enable(&__fpu_state_size_dynamic); return 0; } arch_initcall(xfd_update_static_branch) void fpstate_free(struct fpu *fpu) { if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } /** * fpstate_realloc - Reallocate struct fpstate for the requested new features * * @xfeatures: A bitmap of xstate features which extend the enabled features * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks * with large states are likely to live longer. * * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = &current->thread.fpu; struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); if (!newfps) return -ENOMEM; newfps->size = ksize; newfps->user_size = usize; newfps->is_valloc = true; /* * When a guest FPU is supplied, use @guest_fpu->fpstate * as reference independent whether it is in use or not. */ curfps = guest_fpu ? guest_fpu->fpstate : fpu->fpstate; /* Determine whether @curfps is the active fpstate */ in_use = fpu->fpstate == curfps; if (guest_fpu) { newfps->is_guest = true; newfps->is_confidential = curfps->is_confidential; newfps->in_use = curfps->in_use; guest_fpu->xfeatures |= xfeatures; guest_fpu->uabi_size = usize; } fpregs_lock(); /* * If @curfps is in use, ensure that the current state is in the * registers before swapping fpstate as that might invalidate it * due to layout changes. */ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; if (!guest_fpu) newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); if (guest_fpu) { guest_fpu->fpstate = newfps; /* If curfps is active, update the FPU fpstate pointer */ if (in_use) fpu->fpstate = newfps; } else { fpu->fpstate = newfps; } if (in_use) xfd_update_state(fpu->fpstate); fpregs_unlock(); /* Only free valloc'ed state */ if (curfps && curfps->is_valloc) vfree(curfps); return 0; } static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; unsigned long framesize = get_sigframe_size(); lockdep_assert_held(&current->sighand->siglock); /* get_sigframe_size() is based on fpu_user_cfg.max_size */ framesize -= fpu_user_cfg.max_size; framesize += usize; for_each_thread(leader, thread) { if (thread->sas_ss_size && thread->sas_ss_size < framesize) return -ENOSPC; } return 0; } static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon * vendors into extending XFD for the pre AMX states, especially * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = &current->group_leader->thread.fpu; struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) return 0; /* Calculate the resulting kernel state size */ mask = permitted | requested; /* Take supervisor states into account on the host */ if (!guest) mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); /* Calculate the resulting user state size */ mask &= XFEATURE_MASK_USER_SUPPORTED; usize = xstate_calculate_size(mask, false); if (!guest) { ret = validate_sigaltstack(usize); if (ret) return ret; } perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ perm->__state_size = ksize; perm->__user_state_size = usize; return ret; } /* * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; if (idx >= XFEATURE_MAX) return -EINVAL; /* * Look up the facility mask which can require more than * one xstate component. */ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); requested = xstate_prctl_req[idx]; if (!requested) return -EOPNOTSUPP; if ((fpu_user_cfg.max_features & requested) != requested) return -EOPNOTSUPP; /* Lockless quick check */ permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); permitted = xstate_get_group_perm(guest); /* First vCPU allocation locks the permissions. */ if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) ret = -EBUSY; else ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(&current->sighand->siglock); return ret; } int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { if (!guest_fpu) pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); /* If not permitted let it die */ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(&current->sighand->siglock); return -EPERM; } fpu = &current->group_leader->thread.fpu; perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from * another task, the retrieved buffer sizes are valid for the * currently requested feature(s). */ spin_unlock_irq(&current->sighand->siglock); /* * Try to allocate a new fpstate. If that fails there is no way * out. */ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } int xfd_enable_feature(u64 xfd_err) { return __xfd_enable_feature(xfd_err, NULL); } #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations * @tsk: Redundant pointer to current * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code * * Option arguments: * * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info * ARCH_REQ_XCOMP_PERM: Facility number requested * * For facilities which require more than one XSTATE component, the request * must be the highest state component number related to that facility, * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; return put_user(supported, uptr); case ARCH_GET_XCOMP_PERM: /* * Lockless snapshot as it can also change right after the * dropping the lock. */ permitted = xstate_get_host_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_GET_XCOMP_GUEST_PERM: permitted = xstate_get_guest_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_REQ_XCOMP_GUEST_PERM: guest = true; fallthrough; case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; return xstate_request_perm(idx, guest); default: return -EINVAL; } } #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 * use in the task. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); long delta; if (!timestamp) { /* * Report -1 if no AVX512 usage */ delta = -1; } else { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX */ if (delta < 0) delta = LONG_MAX; delta = jiffies_to_msecs(delta); } seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); seq_putc(m, '\n'); } /* * Report architecture specific information */ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { /* * Report AVX512 state if the processor and build option supported. */ if (cpu_feature_enabled(X86_FEATURE_AVX512F)) avx512_status(m, task); return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */
9 6 13 3 3 3 13 13 11 9 8 6 13 2 2 2 1 1 1 2 204 204 1 204 85 1 117 8 204 15 2 186 6 4 203 250 202 252 63 7 56 18 7 7 7 7 227 1 3 1 2 227 227 35 205 205 204 18 18 204 198 198 130 1 129 198 1 197 67 3 3 3 3 7 7 7 7 6 7 5 5 5 5 7 3 4 4 3 2 2 2 1 14 29 29 30 2 1 2 1 2 1 89 101 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/netfilter_ipv4.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/rculist.h> #include <linux/err.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #endif static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) { return hash_32((__force u32)key ^ (__force u32)remote, IP_TNL_HASH_BITS); } static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) { if (p->i_flags & TUNNEL_KEY) { if (flags & TUNNEL_KEY) return key == p->i_key; else /* key expected, none present */ return false; } else return !(flags & TUNNEL_KEY); } /* Fallback tunnel: no source, no destination, no key, no options Tunnel hash table: We require exact key match i.e. if a key is present in packet it will match only tunnel with the same key; if it is not present, it will match only keyless tunnel. All keysless packets, if not matched configured keyless tunnels will match fallback tunnel. Given src, dst and key, find appropriate for input tunnel. */ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, int link, __be16 flags, __be32 remote, __be32 local, __be32 key) { struct ip_tunnel *t, *cand = NULL; struct hlist_head *head; struct net_device *ndev; unsigned int hash; hash = ip_tunnel_hash(key, remote); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if (local != t->parms.iph.saddr || remote != t->parms.iph.daddr || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (t->parms.link == link) return t; else cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if (remote != t->parms.iph.daddr || t->parms.iph.saddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (t->parms.link == link) return t; else if (!cand) cand = t; } hash = ip_tunnel_hash(key, 0); head = &itn->tunnels[hash]; hlist_for_each_entry_rcu(t, head, hash_node) { if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) continue; if (!(t->dev->flags & IFF_UP)) continue; if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; if (t->parms.link == link) return t; else if (!cand) cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) || t->parms.iph.saddr != 0 || t->parms.iph.daddr != 0 || !(t->dev->flags & IFF_UP)) continue; if (t->parms.link == link) return t; else if (!cand) cand = t; } if (cand) return cand; t = rcu_dereference(itn->collect_md_tun); if (t && t->dev->flags & IFF_UP) return t; ndev = READ_ONCE(itn->fb_tunnel_dev); if (ndev && ndev->flags & IFF_UP) return netdev_priv(ndev); return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, struct ip_tunnel_parm *parms) { unsigned int h; __be32 remote; __be32 i_key = parms->i_key; if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) remote = parms->iph.daddr; else remote = 0; if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) i_key = 0; h = ip_tunnel_hash(i_key, remote); return &itn->tunnels[h]; } static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { if (t->collect_md) rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, struct ip_tunnel_parm *parms, int type) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; __be16 flags = parms->i_flags; int link = parms->link; struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && link == t->parms.link && type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; } return t; } static struct net_device *__ip_tunnel_create(struct net *net, const struct rtnl_link_ops *ops, struct ip_tunnel_parm *parms) { int err; struct ip_tunnel *tunnel; struct net_device *dev; char name[IFNAMSIZ]; err = -E2BIG; if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; strscpy(name, parms->name, IFNAMSIZ); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; strcpy(name, ops->kind); strcat(name, "%d"); } ASSERT_RTNL(); dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); if (!dev) { err = -ENOMEM; goto failed; } dev_net_set(dev, net); dev->rtnl_link_ops = ops; tunnel = netdev_priv(dev); tunnel->parms = *parms; tunnel->net = net; err = register_netdevice(dev); if (err) goto failed_free; return dev; failed_free: free_netdev(dev); failed: return ERR_PTR(err); } static int ip_tunnel_bind_dev(struct net_device *dev) { struct net_device *tdev = NULL; struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *iph; int hlen = LL_MAX_HEADER; int mtu = ETH_DATA_LEN; int t_hlen = tunnel->hlen + sizeof(struct iphdr); iph = &tunnel->parms.iph; /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { struct flowi4 fl4; struct rtable *rt; ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, RT_TOS(iph->tos), dev_net(dev), tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; dst_cache_reset(&tunnel->dst_cache); } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev) { hlen = tdev->hard_header_len + tdev->needed_headroom; mtu = min(tdev->mtu, IP_MAX_MTU); } dev->needed_headroom = t_hlen + hlen; mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0); if (mtu < IPV4_MIN_MTU) mtu = IPV4_MIN_MTU; return mtu; } static struct ip_tunnel *ip_tunnel_create(struct net *net, struct ip_tunnel_net *itn, struct ip_tunnel_parm *parms) { struct ip_tunnel *nt; struct net_device *dev; int t_hlen; int mtu; int err; dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); if (IS_ERR(dev)) return ERR_CAST(dev); mtu = ip_tunnel_bind_dev(dev); err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; nt = netdev_priv(dev); t_hlen = nt->hlen + sizeof(struct iphdr); dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) dev->max_mtu -= dev->hard_header_len; ip_tunnel_add(itn, nt); return nt; err_dev_set_mtu: unregister_netdevice(dev); return ERR_PTR(err); } void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info) { const struct iphdr *iph = ip_hdr(skb); const struct udphdr *udph; if (iph->protocol != IPPROTO_UDP) return; udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2)); info->encap.sport = udph->source; info->encap.dport = udph->dest; } EXPORT_SYMBOL(ip_tunnel_md_udp_encap); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error) { const struct iphdr *iph = ip_hdr(skb); int err; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { DEV_STATS_INC(tunnel->dev, multicast); skb->pkt_type = PACKET_BROADCAST; } #endif if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { DEV_STATS_INC(tunnel->dev, rx_crc_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (tunnel->parms.i_flags&TUNNEL_SEQ) { if (!(tpi->flags&TUNNEL_SEQ) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { DEV_STATS_INC(tunnel->dev, rx_fifo_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; } skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } dev_sw_netstats_rx_add(tunnel->dev, skb->len); skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); if (tunnel->dev->type == ARPHRD_ETHER) { skb->protocol = eth_type_trans(skb, tunnel->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } else { skb->dev = tunnel->dev; } if (tun_dst) skb_dst_set(skb, (struct dst_entry *)tun_dst); gro_cells_receive(&tunnel->gro_cells, skb); return 0; drop: if (tun_dst) dst_release((struct dst_entry *)tun_dst); kfree_skb(skb); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; return !cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], NULL, ops) ? 0 : -1; } EXPORT_SYMBOL(ip_tunnel_encap_add_ops); int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { int ret; if (num >= MAX_IPTUN_ENCAP_OPS) return -ERANGE; ret = (cmpxchg((const struct ip_tunnel_encap_ops **) &iptun_encaps[num], ops, NULL) == ops) ? 0 : -1; synchronize_net(); return ret; } EXPORT_SYMBOL(ip_tunnel_encap_del_ops); int ip_tunnel_encap_setup(struct ip_tunnel *t, struct ip_tunnel_encap *ipencap) { int hlen; memset(&t->encap, 0, sizeof(t->encap)); hlen = ip_encap_hlen(ipencap); if (hlen < 0) return hlen; t->encap.type = ipencap->type; t->encap.sport = ipencap->sport; t->encap.dport = ipencap->dport; t->encap.flags = ipencap->flags; t->encap_hlen = hlen; t->hlen = t->encap_hlen + t->tun_hlen; return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, const struct iphdr *inner_iph, int tunnel_hlen, __be32 dst, bool md) { struct ip_tunnel *tunnel = netdev_priv(dev); int pkt_size; int mtu; tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; pkt_size = skb->len - tunnel_hlen; pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; if (df) { mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen); mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0; } else { mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; } if (skb_valid_dst(skb)) skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!skb_is_gso(skb) && (inner_iph->frag_off & htons(IP_DF)) && mtu < pkt_size) { icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); return -E2BIG; } } #if IS_ENABLED(CONFIG_IPV6) else if (skb->protocol == htons(ETH_P_IPV6)) { struct rt6_info *rt6; __be32 daddr; rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) : NULL; daddr = md ? dst : tunnel->parms.iph.daddr; if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) { if ((daddr && !ipv4_is_multicast(daddr)) || rt6->rt6i_dst.plen == 128) { rt6->rt6i_flags |= RTF_MODIFIED; dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); } } if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && mtu < pkt_size) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); return -E2BIG; } } #endif return 0; } void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { struct ip_tunnel *tunnel = netdev_priv(dev); u32 headroom = sizeof(struct iphdr); struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; const struct iphdr *inner_iph; struct rtable *rt = NULL; struct flowi4 fl4; __be16 df = 0; u8 tos, ttl; bool use_cache; tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || ip_tunnel_info_af(tun_info) != AF_INET)) goto tx_error; key = &tun_info->key; memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); inner_iph = (const struct iphdr *)skb_inner_network_header(skb); tos = key->tos; if (tos == 1) { if (skb->protocol == htons(ETH_P_IP)) tos = inner_iph->tos; else if (skb->protocol == htons(ETH_P_IPV6)) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), RT_TOS(tos), dev_net(dev), 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) tunnel_hlen = ip_encap_hlen(&tun_info->encap); if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0) goto tx_error; use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (key->tun_flags & TUNNEL_DONT_FRAGMENT) df = htons(IP_DF); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, key->u.ipv4.dst, true)) { ip_rt_put(rt); goto tx_error; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = key->ttl; if (ttl == 0) { if (skb->protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; else ttl = ip4_dst_hoplimit(&rt->dst); } headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; if (headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, headroom); if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { ip_rt_put(rt); goto tx_dropped; } iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return; tx_error: DEV_STATS_INC(dev, tx_errors); goto kfree; tx_dropped: DEV_STATS_INC(dev, tx_dropped); kfree: kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, u8 protocol) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_info *tun_info = NULL; const struct iphdr *inner_iph; unsigned int max_headroom; /* The extra header space needed */ struct rtable *rt = NULL; /* Route to the other host */ __be16 payload_protocol; bool use_cache = false; struct flowi4 fl4; bool md = false; bool connected; u8 tos, ttl; __be32 dst; __be16 df; inner_iph = (const struct iphdr *)skb_inner_network_header(skb); connected = (tunnel->parms.iph.daddr != 0); payload_protocol = skb_protocol(skb, true); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); dst = tnl_params->daddr; if (dst == 0) { /* NBMA tunnel */ if (!skb_dst(skb)) { DEV_STATS_INC(dev, tx_fifo_errors); goto tx_error; } tun_info = skb_tunnel_info(skb); if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && ip_tunnel_info_af(tun_info) == AF_INET && tun_info->key.u.ipv4.dst) { dst = tun_info->key.u.ipv4.dst; md = true; connected = true; } else if (payload_protocol == htons(ETH_P_IP)) { rt = skb_rtable(skb); dst = rt_nexthop(rt, inner_iph->daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) { const struct in6_addr *addr6; struct neighbour *neigh; bool do_tx_error_icmp; int addr_type; neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr); if (!neigh) goto tx_error; addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) == 0) do_tx_error_icmp = true; else { do_tx_error_icmp = false; dst = addr6->s6_addr32[3]; } neigh_release(neigh); if (do_tx_error_icmp) goto tx_error_icmp; } #endif else goto tx_error; if (!md) connected = false; } tos = tnl_params->tos; if (tos & 0x1) { tos &= ~0x1; if (payload_protocol == htons(ETH_P_IP)) { tos = inner_iph->tos; connected = false; } else if (payload_protocol == htons(ETH_P_IPV6)) { tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); connected = false; } } ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), dev_net(dev), tunnel->parms.link, tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) goto tx_error; if (connected && md) { use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); if (use_cache) rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); } else { rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : NULL; } if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, fl4.saddr); else if (!md && connected) dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->dst.dev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } df = tnl_params->frag_off; if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df) df |= (inner_iph->frag_off & htons(IP_DF)); if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) { ip_rt_put(rt); goto tx_error; } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); ttl = tnl_params->ttl; if (ttl == 0) { if (payload_protocol == htons(ETH_P_IP)) ttl = inner_iph->ttl; #if IS_ENABLED(CONFIG_IPV6) else if (payload_protocol == htons(ETH_P_IPV6)) ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; #endif else ttl = ip4_dst_hoplimit(&rt->dst); } max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return; } iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return; #if IS_ENABLED(CONFIG_IPV6) tx_error_icmp: dst_link_failure(skb); #endif tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, struct ip_tunnel_parm *p, bool set_mtu, __u32 fwmark) { ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; if (dev->type != ARPHRD_ETHER) { __dev_addr_set(dev, &p->iph.saddr, 4); memcpy(dev->broadcast, &p->iph.daddr, 4); } ip_tunnel_add(itn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; t->parms.link = p->link; t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) dev->mtu = mtu; } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); struct net *net = t->net; struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); switch (cmd) { case SIOCGETTUNNEL: if (dev == itn->fb_tunnel_dev) { t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) t = netdev_priv(dev); } memcpy(p, &t->parms, sizeof(*p)); break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); if (!(p->i_flags & VTI_ISVTI)) { if (!(p->i_flags & TUNNEL_KEY)) p->i_key = 0; if (!(p->o_flags & TUNNEL_KEY)) p->o_key = 0; } t = ip_tunnel_find(itn, p, itn->type); if (cmd == SIOCADDTUNNEL) { if (!t) { t = ip_tunnel_create(net, itn, p); err = PTR_ERR_OR_ZERO(t); break; } err = -EEXIST; break; } if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { err = -EINVAL; break; } t = netdev_priv(dev); } } if (t) { err = 0; ip_tunnel_update(itn, t, dev, p, true, 0); } else { err = -ENOENT; } break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == itn->fb_tunnel_dev) { err = -ENOENT; t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); if (!t) goto done; err = -EPERM; if (t == netdev_priv(itn->fb_tunnel_dev)) goto done; dev = t->dev; } unregister_netdevice(dev); err = 0; break; default: err = -EINVAL; } done: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { struct ip_tunnel_parm p; int err; if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); if (!err && copy_to_user(data, &p, sizeof(p))) return -EFAULT; return err; } EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); int max_mtu = IP_MAX_MTU - t_hlen; if (dev->type == ARPHRD_ETHER) max_mtu -= dev->hard_header_len; if (new_mtu < ETH_MIN_MTU) return -EINVAL; if (new_mtu > max_mtu) { if (strict) return -EINVAL; new_mtu = max_mtu; } dev->mtu = new_mtu; return 0; } EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { return __ip_tunnel_change_mtu(dev, new_mtu, true); } EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); static void ip_tunnel_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); } void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_net *itn; itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } EXPORT_SYMBOL_GPL(ip_tunnel_dellink); struct net *ip_tunnel_get_link_net(const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); return tunnel->net; } EXPORT_SYMBOL(ip_tunnel_get_link_net); int ip_tunnel_get_iflink(const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); return tunnel->parms.link; } EXPORT_SYMBOL(ip_tunnel_get_iflink); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname) { struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); struct ip_tunnel_parm parms; unsigned int i; itn->rtnl_link_ops = ops; for (i = 0; i < IP_TNL_HASH_SIZE; i++) INIT_HLIST_HEAD(&itn->tunnels[i]); if (!ops || !net_has_fallback_tunnels(net)) { struct ip_tunnel_net *it_init_net; it_init_net = net_generic(&init_net, ip_tnl_net_id); itn->type = it_init_net->type; itn->fb_tunnel_dev = NULL; return 0; } memset(&parms, 0, sizeof(parms)); if (devname) strscpy(parms.name, devname, IFNAMSIZ); rtnl_lock(); itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; } rtnl_unlock(); return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, struct list_head *head, struct rtnl_link_ops *ops) { struct net_device *dev, *aux; int h; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); for (h = 0; h < IP_TNL_HASH_SIZE; h++) { struct ip_tunnel *t; struct hlist_node *n; struct hlist_head *thead = &itn->tunnels[h]; hlist_for_each_entry_safe(t, n, thead, hash_node) /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); } } void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, struct rtnl_link_ops *ops) { struct ip_tunnel_net *itn; struct net *net; LIST_HEAD(list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { itn = net_generic(net, id); ip_tunnel_destroy(net, itn, &list, ops); } unregister_netdevice_many(&list); rtnl_unlock(); } EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) { struct ip_tunnel *nt; struct net *net = dev_net(dev); struct ip_tunnel_net *itn; int mtu; int err; nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); if (nt->collect_md) { if (rtnl_dereference(itn->collect_md_tun)) return -EEXIST; } else { if (ip_tunnel_find(itn, p, dev->type)) return -EEXIST; } nt->net = net; nt->parms = *p; nt->fwmark = fwmark; err = register_netdevice(dev); if (err) goto err_register_netdevice; if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) eth_hw_addr_random(dev); mtu = ip_tunnel_bind_dev(dev); if (tb[IFLA_MTU]) { unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr)); if (dev->type == ARPHRD_ETHER) max -= dev->hard_header_len; mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max); } err = dev_set_mtu(dev, mtu); if (err) goto err_dev_set_mtu; ip_tunnel_add(itn, nt); return 0; err_dev_set_mtu: unregister_netdevice(dev); err_register_netdevice: return err; } EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); if (dev == itn->fb_tunnel_dev) return -EINVAL; t = ip_tunnel_find(itn, p, dev->type); if (t) { if (t->dev != dev) return -EEXIST; } else { t = tunnel; if (dev->type != ARPHRD_ETHER) { unsigned int nflags = 0; if (ipv4_is_multicast(p->iph.daddr)) nflags = IFF_BROADCAST; else if (p->iph.daddr) nflags = IFF_POINTOPOINT; if ((dev->flags ^ nflags) & (IFF_POINTOPOINT | IFF_BROADCAST)) return -EINVAL; } } ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); int ip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; int err; dev->needs_free_netdev = true; dev->priv_destructor = ip_tunnel_dev_free; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) { free_percpu(dev->tstats); return err; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); return err; } tunnel->dev = dev; tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; if (tunnel->collect_md) netif_keep_dst(dev); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); void ip_tunnel_uninit(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct net *net = tunnel->net; struct ip_tunnel_net *itn; itn = net_generic(net, tunnel->ip_tnl_net_id); ip_tunnel_del(itn, netdev_priv(dev)); if (itn->fb_tunnel_dev == dev) WRITE_ONCE(itn->fb_tunnel_dev, NULL); dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); /* Do least required initialization, rest of init is done in tunnel_init call */ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) { struct ip_tunnel *tunnel = netdev_priv(dev); tunnel->ip_tnl_net_id = net_id; } EXPORT_SYMBOL_GPL(ip_tunnel_setup); MODULE_LICENSE("GPL");
935 368 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_LWTUNNEL_H #define __NET_LWTUNNEL_H 1 #include <linux/lwtunnel.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/route.h> #define LWTUNNEL_HASH_BITS 7 #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) /* lw tunnel state flags */ #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2) /* LWTUNNEL_XMIT_CONTINUE should be distinguishable from dst_output return * values (NET_XMIT_xxx and NETDEV_TX_xxx in linux/netdevice.h) for safety. */ enum { LWTUNNEL_XMIT_DONE, LWTUNNEL_XMIT_CONTINUE = 0x100, }; struct lwtunnel_state { __u16 type; __u16 flags; __u16 headroom; atomic_t refcnt; int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_input)(struct sk_buff *); struct rcu_head rcu; __u8 data[]; }; struct lwtunnel_encap_ops { int (*build_state)(struct net *net, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack); void (*destroy_state)(struct lwtunnel_state *lws); int (*output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); int (*xmit)(struct sk_buff *skb); struct module *owner; }; #ifdef CONFIG_LWTUNNEL DECLARE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); void lwtstate_free(struct lwtunnel_state *lws); static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { if (lws) atomic_inc(&lws->refcnt); return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { if (!lws) return; if (atomic_dec_and_test(&lws->refcnt)) lwtstate_free(lws); } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT)) return true; return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT)) return true; return false; } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { if ((lwtunnel_xmit_redirect(lwtstate) || lwtunnel_output_redirect(lwtstate)) && lwtstate->headroom < mtu) return lwtstate->headroom; return 0; } int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress); static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { dst->lwtstate->orig_output = dst->output; dst->output = lwtunnel_output; } if (lwtunnel_input_redirect(dst->lwtstate)) { dst->lwtstate->orig_input = dst->input; dst->input = lwtunnel_input; } } #else static inline void lwtstate_free(struct lwtunnel_state *lws) { } static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { return lws; } static inline void lwtstate_put(struct lwtunnel_state *lws) { } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate) { return false; } static inline void lwtunnel_set_redirect(struct dst_entry *dst) { } static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate, unsigned int mtu) { return 0; } static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. */ return 0; } static inline int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack) { return -EOPNOTSUPP; } static inline int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, int encap_attr, int encap_type_attr) { return 0; } static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) { return 0; } static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) { return NULL; } static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) { return 0; } static inline int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } static inline int lwtunnel_xmit(struct sk_buff *skb) { return -EOPNOTSUPP; } #endif /* CONFIG_LWTUNNEL */ #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #endif /* __NET_LWTUNNEL_H */
513 513 513 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 // SPDX-License-Identifier: GPL-2.0-or-later #include <net/gro.h> #include <net/dst_metadata.h> #include <net/busy_poll.h> #include <trace/events/net.h> #define MAX_GRO_SKBS 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) static DEFINE_SPINLOCK(offload_lock); struct list_head offload_base __read_mostly = LIST_HEAD_INIT(offload_base); /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */ int gro_normal_batch __read_mostly = 8; /** * dev_add_offload - register offload handlers * @po: protocol offload declaration * * Add protocol offload handlers to the networking stack. The passed * &proto_offload is linked into kernel lists and may not be freed until * it has been removed from the kernel lists. * * This call does not sleep therefore it can not * guarantee all CPU's that are in middle of receiving packets * will see the new offload handlers (until the next received packet). */ void dev_add_offload(struct packet_offload *po) { struct packet_offload *elem; spin_lock(&offload_lock); list_for_each_entry(elem, &offload_base, list) { if (po->priority < elem->priority) break; } list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); /** * __dev_remove_offload - remove offload handler * @po: packet offload declaration * * Remove a protocol offload handler that was previously added to the * kernel offload handlers by dev_add_offload(). The passed &offload_type * is removed from the kernel lists and can be freed or reused once this * function returns. * * The packet type might still be in use by receivers * and must not be freed until after all the CPU's have gone * through a quiescent state. */ static void __dev_remove_offload(struct packet_offload *po) { struct list_head *head = &offload_base; struct packet_offload *po1; spin_lock(&offload_lock); list_for_each_entry(po1, head, list) { if (po == po1) { list_del_rcu(&po->list); goto out; } } pr_warn("dev_remove_offload: %p not found\n", po); out: spin_unlock(&offload_lock); } /** * dev_remove_offload - remove packet offload handler * @po: packet offload declaration * * Remove a packet offload handler that was previously added to the kernel * offload handlers by dev_add_offload(). The passed &offload_type is * removed from the kernel lists and can be freed or reused once this * function returns. * * This call sleeps to guarantee that no CPU is looking at the packet * type after return. */ void dev_remove_offload(struct packet_offload *po) { __dev_remove_offload(po); synchronize_net(); } EXPORT_SYMBOL(dev_remove_offload); int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) { struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb); unsigned int offset = skb_gro_offset(skb); unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; int segs; /* Do not splice page pool based packets w/ non-page pool * packets. This can result in reference count issues as page * pool pages will not decrement the reference count and will * instead be immediately returned to the pool or have frag * count decremented. */ if (p->pp_recycle != skb->pp_recycle) return -ETOOMANYREFS; /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ gro_max_size = p->protocol == htons(ETH_P_IPV6) ? READ_ONCE(p->dev->gro_max_size) : READ_ONCE(p->dev->gro_ipv4_max_size); if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || (p->protocol == htons(ETH_P_IPV6) && skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } segs = NAPI_GRO_CB(skb)->count; lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); if (headlen <= offset) { skb_frag_t *frag; skb_frag_t *frag2; int i = skbinfo->nr_frags; int nr_frags = pinfo->nr_frags + i; if (nr_frags > MAX_SKB_FRAGS) goto merge; offset -= headlen; pinfo->nr_frags = nr_frags; skbinfo->nr_frags = 0; frag = pinfo->frags + nr_frags; frag2 = skbinfo->frags + i; do { *--frag = *--frag2; } while (--i); skb_frag_off_add(frag, offset); skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); delta_truesize = skb->truesize - new_truesize; skb->truesize = new_truesize; skb->len -= skb->data_len; skb->data_len = 0; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE; goto done; } else if (skb->head_frag) { int nr_frags = pinfo->nr_frags; skb_frag_t *frag = pinfo->frags + nr_frags; struct page *page = virt_to_head_page(skb->head); unsigned int first_size = headlen - offset; unsigned int first_offset; if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS) goto merge; first_offset = skb->data - (unsigned char *)page_address(page) + offset; pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags; skb_frag_fill_page_desc(frag, page, first_offset, first_size); memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); delta_truesize = skb->truesize - new_truesize; skb->truesize = new_truesize; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } merge: /* sk owenrship - if any - completely transferred to the aggregated packet */ skb->destructor = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; skb_frag_off_add(&skbinfo->frags[0], eat); skb_frag_size_sub(&skbinfo->frags[0], eat); skb->data_len -= eat; skb->len -= eat; offset = headlen; } __skb_pull(skb, offset); if (NAPI_GRO_CB(p)->last == p) skb_shinfo(p)->frag_list = skb; else NAPI_GRO_CB(p)->last->next = skb; NAPI_GRO_CB(p)->last = skb; __skb_header_release(skb); lp = p; done: NAPI_GRO_CB(p)->count += segs; p->data_len += len; p->truesize += delta_truesize; p->len += len; if (lp != p) { lp->data_len += len; lp->truesize += delta_truesize; lp->len += len; } NAPI_GRO_CB(skb)->same_flow = 1; return 0; } static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; struct list_head *head = &offload_base; int err = -ENOENT; BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); if (NAPI_GRO_CB(skb)->count == 1) { skb_shinfo(skb)->gso_size = 0; goto out; } rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete, ipv6_gro_complete, inet_gro_complete, skb, 0); break; } rcu_read_unlock(); if (err) { WARN_ON(&ptype->list == head); kfree_skb(skb); return; } out: gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, bool flush_old) { struct list_head *head = &napi->gro_hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); napi_gro_complete(napi, skb); napi->gro_hash[index].count--; } if (!napi->gro_hash[index].count) __clear_bit(index, &napi->gro_bitmask); } /* napi->gro_hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ void napi_gro_flush(struct napi_struct *napi, bool flush_old) { unsigned long bitmask = napi->gro_bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; __napi_gro_flush_chain(napi, base, flush_old); } } EXPORT_SYMBOL(napi_gro_flush); static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, const struct sk_buff *p, unsigned long diffs) { #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) struct tc_skb_ext *skb_ext; struct tc_skb_ext *p_ext; skb_ext = skb_ext_find(skb, TC_SKB_EXT); p_ext = skb_ext_find(p, TC_SKB_EXT); diffs |= (!!p_ext) ^ (!!skb_ext); if (!diffs && unlikely(skb_ext)) diffs |= p_ext->chain ^ skb_ext->chain; #endif return diffs; } static void gro_list_prepare(const struct list_head *head, const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); struct sk_buff *p; list_for_each_entry(p, head, list) { unsigned long diffs; NAPI_GRO_CB(p)->flush = 0; if (hash != skb_get_hash_raw(p)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_all ^ skb->vlan_all; diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), skb_mac_header(skb)); else if (!diffs) diffs = memcmp(skb_mac_header(p), skb_mac_header(skb), maclen); /* in most common scenarions 'slow_gro' is 0 * otherwise we are already on some slower paths * either skip all the infrequent tests altogether or * avoid trying too hard to skip each of them individually */ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { diffs |= p->sk != skb->sk; diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); diffs |= gro_list_prepare_tc_ext(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; } } static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), skb->end - skb->tail); } } static void gro_pull_from_frag0(struct sk_buff *skb, int grow) { struct skb_shared_info *pinfo = skb_shinfo(skb); BUG_ON(skb->end - skb->tail < grow); memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); skb->data_len -= grow; skb->tail += grow; skb_frag_off_add(&pinfo->frags[0], grow); skb_frag_size_sub(&pinfo->frags[0], grow); if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { skb_frag_unref(skb, 0); memmove(pinfo->frags, pinfo->frags + 1, --pinfo->nr_frags * sizeof(pinfo->frags[0])); } } static void gro_try_pull_from_frag0(struct sk_buff *skb) { int grow = skb_gro_offset(skb) - skb_headlen(skb); if (grow > 0) gro_pull_from_frag0(skb, grow); } static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) { struct sk_buff *oldest; oldest = list_last_entry(head, struct sk_buff, list); /* We are called with head length >= MAX_GRO_SKBS, so this is * impossible. */ if (WARN_ON_ONCE(!oldest)) return; /* Do not adjust napi->gro_hash[].count, caller is adding a new * SKB to the chain. */ skb_list_del_init(oldest); napi_gro_complete(napi, oldest); } static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; if (netif_elide_gro(skb->dev)) goto normal; gro_list_prepare(&gro_list->list, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { if (ptype->type == type && ptype->callbacks.gro_receive) goto found_ptype; } rcu_read_unlock(); goto normal; found_ptype: skb_set_network_header(skb, skb_gro_offset(skb)); skb_reset_mac_len(skb); BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->count = 1; if (unlikely(skb_is_gso(skb))) { NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; /* Only support TCP and non DODGY users. */ if (!skb_is_gso_tcp(skb) || (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)) NAPI_GRO_CB(skb)->flush = 1; } /* Setup for GRO checksum validation */ switch (skb->ip_summed) { case CHECKSUM_COMPLETE: NAPI_GRO_CB(skb)->csum = skb->csum; NAPI_GRO_CB(skb)->csum_valid = 1; break; case CHECKSUM_UNNECESSARY: NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; break; } pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, &gro_list->list, skb); rcu_read_unlock(); if (PTR_ERR(pp) == -EINPROGRESS) { ret = GRO_CONSUMED; goto ok; } same_flow = NAPI_GRO_CB(skb)->same_flow; ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); gro_list->count--; } if (same_flow) goto ok; if (NAPI_GRO_CB(skb)->flush) goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) gro_flush_oldest(napi, &gro_list->list); else gro_list->count++; /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */ gro_try_pull_from_frag0(skb); NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; if (!skb_is_gso(skb)) skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; ok: if (gro_list->count) { if (!test_bit(bucket, &napi->gro_bitmask)) __set_bit(bucket, &napi->gro_bitmask); } else if (test_bit(bucket, &napi->gro_bitmask)) { __clear_bit(bucket, &napi->gro_bitmask); } return ret; normal: ret = GRO_NORMAL; gro_try_pull_from_frag0(skb); goto ok; } struct packet_offload *gro_find_receive_by_type(__be16 type) { struct list_head *offload_head = &offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_receive) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_receive_by_type); struct packet_offload *gro_find_complete_by_type(__be16 type) { struct list_head *offload_head = &offload_base; struct packet_offload *ptype; list_for_each_entry_rcu(ptype, offload_head, list) { if (ptype->type != type || !ptype->callbacks.gro_complete) continue; return ptype; } return NULL; } EXPORT_SYMBOL(gro_find_complete_by_type); static gro_result_t napi_skb_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else if (skb->fclone != SKB_FCLONE_UNAVAILABLE) __kfree_skb(skb); else __napi_kfree_skb(skb, SKB_CONSUMED); break; case GRO_HELD: case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { gro_result_t ret; skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); return ret; } EXPORT_SYMBOL(napi_gro_receive); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { if (unlikely(skb->pfmemalloc)) { consume_skb(skb); return; } __skb_pull(skb, skb_headlen(skb)); /* restore the reserve we had after netdev_alloc_skb_ip_align() */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; /* eth_type_trans() assumes pkt_type is PACKET_HOST */ skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); nf_reset_ct(skb); skb->slow_gro = 0; } napi->skb = skb; } struct sk_buff *napi_get_frags(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; if (!skb) { skb = napi_alloc_skb(napi, GRO_MAX_HEAD); if (skb) { napi->skb = skb; skb_mark_napi_id(skb, napi); } } return skb; } EXPORT_SYMBOL(napi_get_frags); static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, gro_result_t ret) { switch (ret) { case GRO_NORMAL: case GRO_HELD: __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) gro_normal_one(napi, skb, 1); break; case GRO_MERGED_FREE: if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) napi_skb_free_stolen_head(skb); else napi_reuse_skb(napi, skb); break; case GRO_MERGED: case GRO_CONSUMED: break; } return ret; } /* Upper GRO stack assumes network header starts at gro_offset=0 * Drivers could call both napi_gro_frags() and napi_gro_receive() * We copy ethernet header into skb->data to have a common layout. */ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) { struct sk_buff *skb = napi->skb; const struct ethhdr *eth; unsigned int hlen = sizeof(*eth); napi->skb = NULL; skb_reset_mac_header(skb); skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { net_warn_ratelimited("%s: dropping impossible skb from %s\n", __func__, napi->dev->name); napi_reuse_skb(napi, skb); return NULL; } } else { eth = (const struct ethhdr *)skb->data; gro_pull_from_frag0(skb, hlen); NAPI_GRO_CB(skb)->frag0 += hlen; NAPI_GRO_CB(skb)->frag0_len -= hlen; } __skb_pull(skb, hlen); /* * This works because the only protocols we care about don't require * special handling. * We'll fix it up properly in napi_frags_finish() */ skb->protocol = eth->h_proto; return skb; } gro_result_t napi_gro_frags(struct napi_struct *napi) { gro_result_t ret; struct sk_buff *skb = napi_frags_skb(napi); trace_napi_gro_frags_entry(skb); ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_frags_exit(ret); return ret; } EXPORT_SYMBOL(napi_gro_frags); /* Compute the checksum from gro_offset and return the folded value * after adding in any pseudo checksum. */ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) { __wsum wsum; __sum16 sum; wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); /* See comments in __skb_checksum_complete(). */ if (likely(!sum)) { if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && !skb->csum_complete_sw) netdev_rx_csum_fault(skb->dev, skb); } NAPI_GRO_CB(skb)->csum = wsum; NAPI_GRO_CB(skb)->csum_valid = 1; return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete);
7 93 67 156 192 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tcp #if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TCP_H #include <linux/ipv6.h> #include <linux/tcp.h> #include <linux/tracepoint.h> #include <net/ipv6.h> #include <net/tcp.h> #include <linux/sock_diag.h> #define TP_STORE_V4MAPPED(__entry, saddr, daddr) \ do { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ ipv6_addr_set_v4mapped(saddr, pin6); \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ ipv6_addr_set_v4mapped(daddr, pin6); \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ do { \ if (sk->sk_family == AF_INET6) { \ struct in6_addr *pin6; \ \ pin6 = (struct in6_addr *)__entry->saddr_v6; \ *pin6 = saddr6; \ pin6 = (struct in6_addr *)__entry->daddr_v6; \ *pin6 = daddr6; \ } else { \ TP_STORE_V4MAPPED(__entry, saddr, daddr); \ } \ } while (0) #else #define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6) \ TP_STORE_V4MAPPED(__entry, saddr, daddr) #endif /* * tcp event with arguments sk and skb * * Note: this class requires a valid sk pointer; while skb pointer could * be NULL. */ DECLARE_EVENT_CLASS(tcp_event_sk_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( __field(const void *, skbaddr) __field(const void *, skaddr) __field(int, state) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( const struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skbaddr = skb; __entry->skaddr = sk; __entry->state = sk->sk_state; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c state=%s", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, show_tcp_state_name(__entry->state)) ); DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * skb of trace_tcp_send_reset is the skb that caused RST. In case of * active reset, skb should be NULL */ DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset, TP_PROTO(const struct sock *sk, const struct sk_buff *skb), TP_ARGS(sk, skb) ); /* * tcp event with arguments sk * * Note: this class requires a valid sk pointer. */ DECLARE_EVENT_CLASS(tcp_event_sk, TP_PROTO(struct sock *sk), TP_ARGS(sk), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u64, sock_cookie) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c sock_cookie=%llx", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->sock_cookie) ); DEFINE_EVENT(tcp_event_sk, tcp_receive_reset, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_destroy_sock, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); DEFINE_EVENT(tcp_event_sk, tcp_rcv_space_adjust, TP_PROTO(struct sock *sk), TP_ARGS(sk) ); TRACE_EVENT(tcp_retransmit_synack, TP_PROTO(const struct sock *sk, const struct request_sock *req), TP_ARGS(sk, req), TP_STRUCT__entry( __field(const void *, skaddr) __field(const void *, req) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) ), TP_fast_assign( struct inet_request_sock *ireq = inet_rsk(req); __be32 *p32; __entry->skaddr = sk; __entry->req = req; __entry->sport = ireq->ir_num; __entry->dport = ntohs(ireq->ir_rmt_port); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = ireq->ir_loc_addr; p32 = (__be32 *) __entry->daddr; *p32 = ireq->ir_rmt_addr; TP_STORE_ADDRS(__entry, ireq->ir_loc_addr, ireq->ir_rmt_addr, ireq->ir_v6_loc_addr, ireq->ir_v6_rmt_addr); ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6) ); #include <trace/events/net_probe_common.h> TRACE_EVENT(tcp_probe, TP_PROTO(struct sock *sk, struct sk_buff *skb), TP_ARGS(sk, skb), TP_STRUCT__entry( /* sockaddr_in6 is always bigger than sockaddr_in */ __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __field(__u32, mark) __field(__u16, data_len) __field(__u32, snd_nxt) __field(__u32, snd_una) __field(__u32, snd_cwnd) __field(__u32, ssthresh) __field(__u32, snd_wnd) __field(__u32, srtt) __field(__u32, rcv_wnd) __field(__u64, sock_cookie) ), TP_fast_assign( const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS(__entry, inet, sk); /* For filtering use */ __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; __entry->family = sk->sk_family; __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tcp_snd_cwnd(tp); __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); __entry->srtt = tp->srtt_us >> 3; __entry->sock_cookie = sock_gen_cookie(sk); ), TP_printk("family=%s src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u sock_cookie=%llx", show_family_name(__entry->family), __entry->saddr, __entry->daddr, __entry->mark, __entry->data_len, __entry->snd_nxt, __entry->snd_una, __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd, __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie) ); #define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) \ do { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in *v4 = (void *)__entry->saddr; \ \ v4->sin_family = AF_INET; \ v4->sin_port = th->source; \ v4->sin_addr.s_addr = ip_hdr(skb)->saddr; \ v4 = (void *)__entry->daddr; \ v4->sin_family = AF_INET; \ v4->sin_port = th->dest; \ v4->sin_addr.s_addr = ip_hdr(skb)->daddr; \ } while (0) #if IS_ENABLED(CONFIG_IPV6) #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ do { \ const struct iphdr *iph = ip_hdr(skb); \ \ if (iph->version == 6) { \ const struct tcphdr *th = (const struct tcphdr *)skb->data; \ struct sockaddr_in6 *v6 = (void *)__entry->saddr; \ \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->source; \ v6->sin6_addr = ipv6_hdr(skb)->saddr; \ v6 = (void *)__entry->daddr; \ v6->sin6_family = AF_INET6; \ v6->sin6_port = th->dest; \ v6->sin6_addr = ipv6_hdr(skb)->daddr; \ } else \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb); \ } while (0) #else #define TP_STORE_ADDR_PORTS_SKB(__entry, skb) \ TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb) #endif /* * tcp event with only skb */ DECLARE_EVENT_CLASS(tcp_event_skb, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field(const void *, skbaddr) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( __entry->skbaddr = skb; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(__entry, skb); ), TP_printk("src=%pISpc dest=%pISpc", __entry->saddr, __entry->daddr) ); DEFINE_EVENT(tcp_event_skb, tcp_bad_csum, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); TRACE_EVENT(tcp_cong_state_set, TP_PROTO(struct sock *sk, const u8 ca_state), TP_ARGS(sk, ca_state), TP_STRUCT__entry( __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) __array(__u8, daddr_v6, 16) __field(__u8, cong_state) ), TP_fast_assign( struct inet_sock *inet = inet_sk(sk); __be32 *p32; __entry->skaddr = sk; __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; p32 = (__be32 *) __entry->daddr; *p32 = inet->inet_daddr; TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr, sk->sk_v6_rcv_saddr, sk->sk_v6_daddr); __entry->cong_state = ca_state; ), TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, __entry->cong_state) ); #endif /* _TRACE_TCP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1179 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 // SPDX-License-Identifier: GPL-2.0 /* * USB-ACPI glue code * * Copyright 2012 Red Hat <mjg@redhat.com> */ #include <linux/module.h> #include <linux/usb.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/pci.h> #include <linux/usb/hcd.h> #include "hub.h" /** * usb_acpi_power_manageable - check whether usb port has * acpi power resource. * @hdev: USB device belonging to the usb hub * @index: port index based zero * * Return true if the port has acpi power resource and false if no. */ bool usb_acpi_power_manageable(struct usb_device *hdev, int index) { acpi_handle port_handle; int port1 = index + 1; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (port_handle) return acpi_bus_power_manageable(port_handle); else return false; } EXPORT_SYMBOL_GPL(usb_acpi_power_manageable); #define UUID_USB_CONTROLLER_DSM "ce2ee385-00e6-48cb-9f05-2edb927c4899" #define USB_DSM_DISABLE_U1_U2_FOR_PORT 5 /** * usb_acpi_port_lpm_incapable - check if lpm should be disabled for a port. * @hdev: USB device belonging to the usb hub * @index: zero based port index * * Some USB3 ports may not support USB3 link power management U1/U2 states * due to different retimer setup. ACPI provides _DSM method which returns 0x01 * if U1 and U2 states should be disabled. Evaluate _DSM with: * Arg0: UUID = ce2ee385-00e6-48cb-9f05-2edb927c4899 * Arg1: Revision ID = 0 * Arg2: Function Index = 5 * Arg3: (empty) * * Return 1 if USB3 port is LPM incapable, negative on error, otherwise 0 */ int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index) { union acpi_object *obj; acpi_handle port_handle; int port1 = index + 1; guid_t guid; int ret; ret = guid_parse(UUID_USB_CONTROLLER_DSM, &guid); if (ret) return ret; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) { dev_dbg(&hdev->dev, "port-%d no acpi handle\n", port1); return -ENODEV; } if (!acpi_check_dsm(port_handle, &guid, 0, BIT(USB_DSM_DISABLE_U1_U2_FOR_PORT))) { dev_dbg(&hdev->dev, "port-%d no _DSM function %d\n", port1, USB_DSM_DISABLE_U1_U2_FOR_PORT); return -ENODEV; } obj = acpi_evaluate_dsm_typed(port_handle, &guid, 0, USB_DSM_DISABLE_U1_U2_FOR_PORT, NULL, ACPI_TYPE_INTEGER); if (!obj) { dev_dbg(&hdev->dev, "evaluate port-%d _DSM failed\n", port1); return -EINVAL; } if (obj->integer.value == 0x01) ret = 1; ACPI_FREE(obj); return ret; } EXPORT_SYMBOL_GPL(usb_acpi_port_lpm_incapable); /** * usb_acpi_set_power_state - control usb port's power via acpi power * resource * @hdev: USB device belonging to the usb hub * @index: port index based zero * @enable: power state expected to be set * * Notice to use usb_acpi_power_manageable() to check whether the usb port * has acpi power resource before invoking this function. * * Returns 0 on success, else negative errno. */ int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *port_dev; acpi_handle port_handle; unsigned char state; int port1 = index + 1; int error = -EINVAL; if (!hub) return -ENODEV; port_dev = hub->ports[port1 - 1]; port_handle = (acpi_handle) usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) return error; if (enable) state = ACPI_STATE_D0; else state = ACPI_STATE_D3_COLD; error = acpi_bus_set_power(port_handle, state); if (!error) dev_dbg(&port_dev->dev, "acpi: power was set to %d\n", enable); else dev_dbg(&port_dev->dev, "acpi: power failed to be set\n"); return error; } EXPORT_SYMBOL_GPL(usb_acpi_set_power_state); static enum usb_port_connect_type usb_acpi_get_connect_type(acpi_handle handle, struct acpi_pld_info *pld) { enum usb_port_connect_type connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *upc = NULL; acpi_status status; /* * According to 9.14 in ACPI Spec 6.2. _PLD indicates whether usb port * is user visible and _UPC indicates whether it is connectable. If * the port was visible and connectable, it could be freely connected * and disconnected with USB devices. If no visible and connectable, * a usb device is directly hard-wired to the port. If no visible and * no connectable, the port would be not used. */ status = acpi_evaluate_object(handle, "_UPC", NULL, &buffer); if (ACPI_FAILURE(status)) goto out; upc = buffer.pointer; if (!upc || (upc->type != ACPI_TYPE_PACKAGE) || upc->package.count != 4) goto out; if (upc->package.elements[0].integer.value) if (pld->user_visible) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; else connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; else if (!pld->user_visible) connect_type = USB_PORT_NOT_USED; out: kfree(upc); return connect_type; } /* * Private to usb-acpi, all the core needs to know is that * port_dev->location is non-zero when it has been set by the firmware. */ #define USB_ACPI_LOCATION_VALID (1 << 31) static struct acpi_device * usb_acpi_get_companion_for_port(struct usb_port *port_dev) { struct usb_device *udev; struct acpi_device *adev; acpi_handle *parent_handle; int port1; /* Get the struct usb_device point of port's hub */ udev = to_usb_device(port_dev->dev.parent->parent); /* * The root hub ports' parent is the root hub. The non-root-hub * ports' parent is the parent hub port which the hub is * connected to. */ if (!udev->parent) { adev = ACPI_COMPANION(&udev->dev); port1 = usb_hcd_find_raw_port_number(bus_to_hcd(udev->bus), port_dev->portnum); } else { parent_handle = usb_get_hub_port_acpi_handle(udev->parent, udev->portnum); if (!parent_handle) return NULL; adev = acpi_fetch_acpi_dev(parent_handle); port1 = port_dev->portnum; } return acpi_find_child_by_adr(adev, port1); } static struct acpi_device * usb_acpi_find_companion_for_port(struct usb_port *port_dev) { struct acpi_device *adev; struct acpi_pld_info *pld; acpi_handle *handle; acpi_status status; adev = usb_acpi_get_companion_for_port(port_dev); if (!adev) return NULL; handle = adev->handle; status = acpi_get_physical_device_location(handle, &pld); if (ACPI_SUCCESS(status) && pld) { port_dev->location = USB_ACPI_LOCATION_VALID | pld->group_token << 8 | pld->group_position; port_dev->connect_type = usb_acpi_get_connect_type(handle, pld); ACPI_FREE(pld); } return adev; } static struct acpi_device * usb_acpi_find_companion_for_device(struct usb_device *udev) { struct acpi_device *adev; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent) { /* * root hub is only child (_ADR=0) under its parent, the HC. * sysdev pointer is the HC as seen from firmware. */ adev = ACPI_COMPANION(udev->bus->sysdev); return acpi_find_child_device(adev, 0, false); } hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return NULL; /* * This is an embedded USB device connected to a port and such * devices share port's ACPI companion. */ port_dev = hub->ports[udev->portnum - 1]; return usb_acpi_get_companion_for_port(port_dev); } static struct acpi_device *usb_acpi_find_companion(struct device *dev) { /* * The USB hierarchy like following: * * Device (EHC1) * Device (HUBN) * Device (PR01) * Device (PR11) * Device (PR12) * Device (FN12) * Device (FN13) * Device (PR13) * ... * where HUBN is root hub, and PRNN are USB ports and devices * connected to them, and FNNN are individualk functions for * connected composite USB devices. PRNN and FNNN may contain * _CRS and other methods describing sideband resources for * the connected device. * * On the kernel side both root hub and embedded USB devices are * represented as instances of usb_device structure, and ports * are represented as usb_port structures, so the whole process * is split into 2 parts: finding companions for devices and * finding companions for ports. * * Note that we do not handle individual functions of composite * devices yet, for that we would need to assign companions to * devices corresponding to USB interfaces. */ if (is_usb_device(dev)) return usb_acpi_find_companion_for_device(to_usb_device(dev)); else if (is_usb_port(dev)) return usb_acpi_find_companion_for_port(to_usb_port(dev)); return NULL; } static bool usb_acpi_bus_match(struct device *dev) { return is_usb_device(dev) || is_usb_port(dev); } static struct acpi_bus_type usb_acpi_bus = { .name = "USB", .match = usb_acpi_bus_match, .find_companion = usb_acpi_find_companion, }; int usb_acpi_register(void) { return register_acpi_bus_type(&usb_acpi_bus); } void usb_acpi_unregister(void) { unregister_acpi_bus_type(&usb_acpi_bus); }
1755 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor Centralised disconnect handling. * New timer architecture. * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of * facilities negotiation and increased * the throughput upper limit. * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). * Fixed x25_output() related skb leakage. * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data * 2005-04-15 Shaun Pereira Fast select with no restriction on * response */ #define pr_fmt(fmt) "X25: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/notifier.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ctype.h> #include <net/x25.h> #include <net/compat.h> int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; int sysctl_x25_forward = 0; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); static const struct proto_ops x25_proto_ops; static const struct x25_address null_x25_address = {" "}; #ifdef CONFIG_COMPAT struct compat_x25_subscrip_struct { char device[200-sizeof(compat_ulong_t)]; compat_ulong_t global_facil_mask; compat_uint_t extended; }; #endif int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned char len; int needed; int rc; if (!pskb_may_pull(skb, 1)) { /* packet has no address block */ rc = 0; goto empty; } len = *skb->data; needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims to hold */ rc = -1; goto empty; } return x25_addr_ntoa(skb->data, called_addr, calling_addr); empty: *called_addr->x25_addr = 0; *calling_addr->x25_addr = 0; return rc; } int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; called = called_addr->x25_addr; calling = calling_addr->x25_addr; p++; for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *called++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *called++ = ((*p >> 4) & 0x0F) + '0'; } } else { if (i % 2 != 0) { *calling++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *calling++ = ((*p >> 4) & 0x0F) + '0'; } } } *called = *calling = '\0'; return 1 + (called_len + calling_len + 1) / 2; } int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; int i; called = called_addr->x25_addr; calling = calling_addr->x25_addr; called_len = strlen(called); calling_len = strlen(calling); *p++ = (calling_len << 4) | (called_len << 0); for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *p |= (*called++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*called++ - '0') << 4; } } else { if (i % 2 != 0) { *p |= (*calling++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*calling++ - '0') << 4; } } } return 1 + (called_len + calling_len + 1) / 2; } /* * Socket removal during an interrupt is now safe. */ static void x25_remove_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_del_node_init(sk); write_unlock_bh(&x25_list_lock); } /* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type == ARPHRD_X25) { switch (event) { case NETDEV_REGISTER: case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } x25_route_device_down(dev); break; case NETDEV_PRE_TYPE_CHANGE: case NETDEV_UNREGISTER: x25_link_device_down(dev); break; case NETDEV_CHANGE: if (!netif_carrier_ok(dev)) { nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } } break; } } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void x25_insert_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_add_node(sk, &x25_list); write_unlock_bh(&x25_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. Check the full list for an address/cud match. * If no cuds match return the next_best thing, an address match. * Note: if a listening socket has cud set it must only get calls * with matching cud. */ static struct sock *x25_find_listener(struct x25_address *addr, struct sk_buff *skb) { struct sock *s; struct sock *next_best; read_lock_bh(&x25_list_lock); next_best = NULL; sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ if (x25_sk(s)->cudmatchlength > 0 && skb->len >= x25_sk(s)->cudmatchlength) { if((memcmp(x25_sk(s)->calluserdata.cuddata, skb->data, x25_sk(s)->cudmatchlength)) == 0) { sock_hold(s); goto found; } } else next_best = s; } if (next_best) { s = next_best; sock_hold(s); goto found; } s = NULL; found: read_unlock_bh(&x25_list_lock); return s; } /* * Find a connected X.25 socket given my LCI and neighbour. */ static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; } s = NULL; found: return s; } struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; read_lock_bh(&x25_list_lock); s = __x25_find_socket(lci, nb); read_unlock_bh(&x25_list_lock); return s; } /* * Find a unique LCI for a given device. */ static unsigned int x25_new_lci(struct x25_neigh *nb) { unsigned int lci = 1; struct sock *sk; while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } cond_resched(); } return lci; } /* * Deferred destroy. */ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ static void x25_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer */ static void __x25_destroy_socket(struct sock *sk) { struct sk_buff *skb; x25_stop_heartbeat(sk); x25_stop_timer(sk); x25_remove_socket(sk); x25_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* * Queue the unaccepted socket for death */ skb->sk->sk_state = TCP_LISTEN; sock_set_flag(skb->sk, SOCK_DEAD); x25_start_heartbeat(skb->sk); x25_sk(skb->sk)->state = X25_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ __sock_put(sk); } } void x25_destroy_socket_from_timer(struct sock *sk) { sock_hold(sk); bh_lock_sock(sk); __x25_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * X.25 socket object. */ static int x25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; int rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EINVAL; if (optlen < sizeof(int)) goto out; rc = -EFAULT; if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); else clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = 0; out: return rc; } static int x25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int val, len, rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EFAULT; if (get_user(len, optlen)) goto out; len = min_t(unsigned int, len, sizeof(int)); rc = -EINVAL; if (len < 0) goto out; rc = -EFAULT; if (put_user(len, optlen)) goto out; val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = copy_to_user(optval, &val, len) ? -EFAULT : 0; out: return rc; } static int x25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { rc = -EINVAL; release_sock(sk); return rc; } if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } release_sock(sk); return rc; } static struct proto x25_proto = { .name = "X25", .owner = THIS_MODULE, .obj_size = sizeof(struct x25_sock), }; static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; sock_init_data(NULL, sk); x25 = x25_sk(sk); skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); out: return sk; } static int x25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct x25_sock *x25; int rc = -EAFNOSUPPORT; if (!net_eq(net, &init_net)) goto out; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_SEQPACKET) goto out; rc = -EINVAL; if (protocol) goto out; rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); sock_init_data(sock, sk); x25_init_timers(sk); sock->ops = &x25_proto_ops; sk->sk_protocol = protocol; sk->sk_backlog_rcv = x25_backlog_rcv; x25->t21 = sysctl_x25_call_request_timeout; x25->t22 = sysctl_x25_reset_request_timeout; x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = 0; /* by default don't negotiate throughput */ x25->facilities.reverse = X25_DEFAULT_REVERSE; x25->dte_facilities.calling_len = 0; x25->dte_facilities.called_len = 0; memset(x25->dte_facilities.called_ae, '\0', sizeof(x25->dte_facilities.called_ae)); memset(x25->dte_facilities.calling_ae, '\0', sizeof(x25->dte_facilities.calling_ae)); rc = 0; out: return rc; } static struct sock *x25_make_new(struct sock *osk) { struct sock *sk = NULL; struct x25_sock *x25, *ox25; if (osk->sk_type != SOCK_SEQPACKET) goto out; if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); sk->sk_type = osk->sk_type; sk->sk_priority = osk->sk_priority; sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_backlog_rcv = osk->sk_backlog_rcv; sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; x25->t22 = ox25->t22; x25->t23 = ox25->t23; x25->t2 = ox25->t2; x25->flags = ox25->flags; x25->facilities = ox25->facilities; x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; clear_bit(X25_INTERRUPT_FLAG, &x25->flags); x25_init_timers(sk); out: return sk; } static int x25_release(struct socket *sock) { struct sock *sk = sock->sk; struct x25_sock *x25; if (!sk) return 0; x25 = x25_sk(sk); sock_hold(sk); lock_sock(sk); switch (x25->state) { case X25_STATE_0: case X25_STATE_2: x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; case X25_STATE_1: case X25_STATE_3: case X25_STATE_4: x25_clear_queues(sk); x25_write_internal(sk, X25_CLEAR_REQUEST); x25_start_t23timer(sk); x25->state = X25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; case X25_STATE_5: x25_write_internal(sk, X25_CLEAR_REQUEST); x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; } sock_orphan(sk); out: release_sock(sk); sock_put(sk); return 0; } static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } /* check for the null_x25_address */ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { len = strlen(addr->sx25_addr.x25_addr); for (i = 0; i < len; i++) { if (!isdigit(addr->sx25_addr.x25_addr[i])) { rc = -EINVAL; goto out; } } } lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); } else { rc = -EINVAL; } release_sock(sk); SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); out: return rc; } static int x25_wait_for_connection_establishment(struct sock *sk) { DECLARE_WAITQUEUE(wait, current); int rc; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = sock_error(sk); if (rc) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = -ENOTCONN; if (sk->sk_state == TCP_CLOSE) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); schedule(); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; struct x25_route *rt; int rc = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out; /* Connect completed during a ERESTARTSYS event */ } rc = -ECONNREFUSED; if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; goto out; } rc = -EISCONN; /* No reconnect on a seqpacket socket */ if (sk->sk_state == TCP_ESTABLISHED) goto out; rc = -EALREADY; /* Do nothing if call is already in progress */ if (sk->sk_state == TCP_SYN_SENT) goto out; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; rt = x25_get_route(&addr->sx25_addr); if (!rt) goto out; x25->neighbour = x25_get_neigh(rt->dev); if (!x25->neighbour) goto out_put_route; x25_limit_facilities(&x25->facilities, x25->neighbour); x25->lci = x25_new_lci(x25->neighbour); if (!x25->lci) goto out_put_neigh; rc = -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ goto out_put_neigh; if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) memset(&x25->source_addr, '\0', X25_ADDR_LEN); x25->dest_addr = addr->sx25_addr; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; x25->state = X25_STATE_1; x25_write_internal(sk, X25_CALL_REQUEST); x25_start_heartbeat(sk); x25_start_t21timer(sk); /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) goto out_put_neigh; sock->state = SS_CONNECTED; rc = 0; out_put_neigh: if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; read_unlock_bh(&x25_list_lock); x25->state = X25_STATE_0; } out_put_route: x25_route_put(rt); out: release_sock(sk); return rc; } static int x25_wait_for_data(struct sock *sk, long timeout) { DECLARE_WAITQUEUE(wait, current); int rc = 0; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; if (skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; struct sock *newsk; struct sk_buff *skb; int rc = -EINVAL; if (!sk) goto out; rc = -EOPNOTSUPP; if (sk->sk_type != SOCK_SEQPACKET) goto out; lock_sock(sk); rc = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out2; rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto out2; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: release_sock(sk); out: return rc; } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); int rc = 0; if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; goto out; } sx25->sx25_addr = x25->dest_addr; } else sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; rc = sizeof(*sx25); out: return rc; } int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { struct sock *sk; struct sock *make; struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; int len, addr_len, rc; /* * Remove the LCI and frame type. */ skb_pull(skb, X25_STD_MIN_LEN); /* * Extract the X.25 addresses and convert them to ASCII strings, * and remove them. * * Address block is mandatory in call request packets */ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); if (addr_len <= 0) goto out_clear_request; skb_pull(skb, addr_len); /* * Get the length of the facilities, skip past them for the moment * get the call user data because this is needed to determine * the correct listener * * Facilities length is mandatory in call request packets */ if (!pskb_may_pull(skb, 1)) goto out_clear_request; len = skb->data[0] + 1; if (!pskb_may_pull(skb, len)) goto out_clear_request; skb_pull(skb,len); /* * Ensure that the amount of call user data is valid. */ if (skb->len > X25_MAX_CUD_LEN) goto out_clear_request; /* * Get all the call user data so it can be used in * x25_find_listener and skb_copy_from_linear_data up ahead. */ if (!pskb_may_pull(skb, skb->len)) goto out_clear_request; /* * Find a listener for the particular address/cud pair. */ sk = x25_find_listener(&source_addr,skb); skb_push(skb,len); if (sk != NULL && sk_acceptq_is_full(sk)) { goto out_sock_put; } /* * We dont have any listeners for this incoming call. * Try forwarding it. */ if (sk == NULL) { skb_push(skb, addr_len + X25_STD_MIN_LEN); if (sysctl_x25_forward && x25_forward_call(&dest_addr, nb, skb, lci) > 0) { /* Call was forwarded, dont process it any more */ kfree_skb(skb); rc = 1; goto out; } else { /* No listeners, can't forward, clear the call */ goto out_clear_request; } } /* * Try to reach a compromise on the requested facilities. */ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); if (len == -1) goto out_sock_put; /* * current neighbour/link might impose additional limits * on certain facilities */ x25_limit_facilities(&facilities, nb); /* * Try to create a new socket. */ make = x25_make_new(sk); if (!make) goto out_sock_put; /* * Remove the facilities */ skb_pull(skb, len); skb->sk = make; make->sk_state = TCP_ESTABLISHED; makex25 = x25_sk(make); makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; /* ensure no calling address extension on accept */ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediately */ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; } else { makex25->state = X25_STATE_5; } /* * Incoming Call User Data. */ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; sk_acceptq_added(sk); x25_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: return rc; out_sock_put: sock_put(sk); out_clear_request: rc = 0; x25_transmit_clear_request(nb, lci, 0x01); goto out; } static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char *asmptr; int noblock = msg->msg_flags & MSG_DONTWAIT; size_t size; int qbit = 0, rc = -EINVAL; lock_sock(sk); if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) goto out; /* we currently don't support segmented records at the user interface */ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) goto out; rc = -EADDRNOTAVAIL; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); goto out; } rc = -ENETUNREACH; if (!x25->neighbour) goto out; if (usx25) { rc = -EINVAL; if (msg->msg_namelen < sizeof(sx25)) goto out; memcpy(&sx25, usx25, sizeof(sx25)); rc = -EISCONN; if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) goto out; rc = -EINVAL; if (sx25.sx25_family != AF_X25) goto out; } else { /* * FIXME 1003.1g - if the socket is like this because * it has become closed (not started closed) we ought * to SIGPIPE, EPIPE; */ rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; sx25.sx25_family = AF_X25; sx25.sx25_addr = x25->dest_addr; } /* Sanity check the packet size */ if (len > 65535) { rc = -EMSGSIZE; goto out; } SOCK_DEBUG(sk, "x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ SOCK_DEBUG(sk, "x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; X25_SKB_CB(skb)->flags = msg->msg_flags; skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); /* * Put the data on the end */ SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { if (!pskb_may_pull(skb, 1)) goto out_kfree_skb; qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the X.25 header */ SOCK_DEBUG(sk, "x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } else { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } } else { if (x25->neighbour->extended) { /* Build an Extended X.25 header */ asmptr = skb_push(skb, X25_EXT_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; *asmptr++ = X25_DATA; } else { /* Build an Standard X.25 header */ asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; } if (qbit) skb->data[0] |= X25_Q_BIT; } SOCK_DEBUG(sk, "x25_sendmsg: Built header.\n"); SOCK_DEBUG(sk, "x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out_kfree_skb; if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { rc = x25_output(sk, skb); len = rc; if (rc < 0) kfree_skb(skb); else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) len++; } x25_kick(sk); rc = len; out: release_sock(sk); return rc; out_kfree_skb: kfree_skb(skb); goto out; } static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; unsigned char *asmptr; int rc = -ENOTCONN; lock_sock(sk); if (x25->neighbour == NULL) goto out; header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) goto out; if (flags & MSG_OOB) { rc = -EINVAL; if (sock_flag(sk, SOCK_URGINLINE) || !skb_peek(&x25->interrupt_in_queue)) goto out; skb = skb_dequeue(&x25->interrupt_in_queue); if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) goto out_free_dgram; skb_pull(skb, X25_STD_MIN_LEN); /* * No Q bit information on Interrupt data. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = 0x00; } msg->msg_flags |= MSG_OOB; } else { /* Now we can treat all alike */ release_sock(sk); skb = skb_recv_datagram(sk, flags, &rc); lock_sock(sk); if (!skb) goto out; if (!pskb_may_pull(skb, header_len)) goto out_free_dgram; qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; skb_pull(skb, header_len); if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = qbit; } } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; if (sx25) { sx25->sx25_family = AF_X25; sx25->sx25_addr = x25->dest_addr; msg->msg_namelen = sizeof(*sx25); } x25_check_rbuf(sk); rc = copied; out_free_dgram: skb_free_datagram(sk, skb); out: release_sock(sk); return rc; } static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); void __user *argp = (void __user *)arg; int rc; switch (cmd) { case TIOCOUTQ: { int amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; rc = put_user(amount, (unsigned int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; int amount = 0; /* * These two are safe on a single CPU system as * only user tasks fiddle here */ lock_sock(sk); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); rc = put_user(amount, (unsigned int __user *)argp); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->facilities, sizeof(x25->facilities)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SFACILITIES: { struct x25_facilities facilities; rc = -EFAULT; if (copy_from_user(&facilities, argp, sizeof(facilities))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_fac_release; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) goto out_fac_release; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) goto out_fac_release; if (facilities.winsize_in < 1 || facilities.winsize_in > 127) goto out_fac_release; if (facilities.throughput) { int out = facilities.throughput & 0xf0; int in = facilities.throughput & 0x0f; if (!out) facilities.throughput |= X25_DEFAULT_THROUGHPUT << 4; else if (out < 0x30 || out > 0xD0) goto out_fac_release; if (!in) facilities.throughput |= X25_DEFAULT_THROUGHPUT; else if (in < 0x03 || in > 0x0D) goto out_fac_release; } if (facilities.reverse && (facilities.reverse & 0x81) != 0x81) goto out_fac_release; x25->facilities = facilities; rc = 0; out_fac_release: release_sock(sk); break; } case SIOCX25GDTEFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->dte_facilities, sizeof(x25->dte_facilities)); release_sock(sk); if (rc) rc = -EFAULT; break; } case SIOCX25SDTEFACILITIES: { struct x25_dte_facilities dtefacs; rc = -EFAULT; if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: release_sock(sk); break; } case SIOCX25GCALLUSERDATA: { lock_sock(sk); rc = copy_to_user(argp, &x25->calluserdata, sizeof(x25->calluserdata)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCALLUSERDATA: { struct x25_calluserdata calluserdata; rc = -EFAULT; if (copy_from_user(&calluserdata, argp, sizeof(calluserdata))) break; rc = -EINVAL; if (calluserdata.cudlength > X25_MAX_CUD_LEN) break; lock_sock(sk); x25->calluserdata = calluserdata; release_sock(sk); rc = 0; break; } case SIOCX25GCAUSEDIAG: { lock_sock(sk); rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCAUSEDIAG: { struct x25_causediag causediag; rc = -EFAULT; if (copy_from_user(&causediag, argp, sizeof(causediag))) break; lock_sock(sk); x25->causediag = causediag; release_sock(sk); rc = 0; break; } case SIOCX25SCUDMATCHLEN: { struct x25_subaddr sub_addr; rc = -EINVAL; lock_sock(sk); if(sk->sk_state != TCP_CLOSE) goto out_cud_release; rc = -EFAULT; if (copy_from_user(&sub_addr, argp, sizeof(sub_addr))) goto out_cud_release; rc = -EINVAL; if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN) goto out_cud_release; x25->cudmatchlength = sub_addr.cudmatchlength; rc = 0; out_cud_release: release_sock(sk); break; } case SIOCX25CALLACCPTAPPRV: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_CLOSE) { clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); rc = 0; } release_sock(sk); break; } case SIOCX25SENDCALLACCPT: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) goto out_sendcallaccpt_release; /* must call accptapprv above */ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) goto out_sendcallaccpt_release; x25_write_internal(sk, X25_CALL_ACCEPTED); x25->state = X25_STATE_3; rc = 0; out_sendcallaccpt_release: release_sock(sk); break; } default: rc = -ENOIOCTLCMD; break; } return rc; } static const struct net_proto_family x25_family_ops = { .family = AF_X25, .create = x25_create, .owner = THIS_MODULE, }; #ifdef CONFIG_COMPAT static int compat_x25_subscr_ioctl(unsigned int cmd, struct compat_x25_subscrip_struct __user *x25_subscr32) { struct compat_x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; rc = -EFAULT; if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) goto out; rc = -EINVAL; dev = x25_dev_get(x25_subscr.device); if (dev == NULL) goto out; nb = x25_get_neigh(dev); if (nb == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(x25_subscr32, &x25_subscr, sizeof(*x25_subscr32)) ? -EFAULT : 0; } else { rc = -EINVAL; if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int rc = -ENOIOCTLCMD; switch(cmd) { case TIOCOUTQ: case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: case SIOCX25SFACILITIES: case SIOCX25GDTEFACILITIES: case SIOCX25SDTEFACILITIES: case SIOCX25GCALLUSERDATA: case SIOCX25SCALLUSERDATA: case SIOCX25GCAUSEDIAG: case SIOCX25SCAUSEDIAG: case SIOCX25SCUDMATCHLEN: case SIOCX25CALLACCPTAPPRV: case SIOCX25SENDCALLACCPT: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #endif static const struct proto_ops x25_proto_ops = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, .bind = x25_bind, .connect = x25_connect, .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, .getsockopt = x25_getsockopt, .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, }; static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; static struct notifier_block x25_dev_notifier = { .notifier_call = x25_device_event, }; void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; write_lock_bh(&x25_list_lock); sk_for_each(s, &x25_list) { if (x25_sk(s)->neighbour == nb) { write_unlock_bh(&x25_list_lock); lock_sock(s); x25_disconnect(s, ENETUNREACH, 0, 0); release_sock(s); write_lock_bh(&x25_list_lock); } } write_unlock_bh(&x25_list_lock); /* Remove any related forwards */ x25_clear_forward_by_dev(nb->dev); } static int __init x25_init(void) { int rc; rc = proto_register(&x25_proto, 0); if (rc) goto out; rc = sock_register(&x25_family_ops); if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); if (rc) goto out_sock; rc = x25_register_sysctl(); if (rc) goto out_dev; rc = x25_proc_init(); if (rc) goto out_sysctl; pr_info("Linux Version 0.2\n"); out: return rc; out_sysctl: x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); goto out; } module_init(x25_init); static void __exit x25_exit(void) { x25_proc_exit(); x25_link_free(); x25_route_free(); x25_unregister_sysctl(); unregister_netdevice_notifier(&x25_dev_notifier); dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); proto_unregister(&x25_proto); } module_exit(x25_exit); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_X25);
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 /* * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _TLS_OFFLOAD_H #define _TLS_OFFLOAD_H #include <linux/types.h> #include <asm/byteorder.h> #include <linux/crypto.h> #include <linux/socket.h> #include <linux/tcp.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/tcp.h> #include <net/strparser.h> #include <crypto/aead.h> #include <uapi/linux/tls.h> struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE #define TLS_CRYPTO_INFO_READY(info) ((info)->cipher_type) #define TLS_AAD_SPACE_SIZE 13 #define MAX_IV_SIZE 16 #define TLS_TAG_SIZE 16 #define TLS_MAX_REC_SEQ_SIZE 8 #define TLS_MAX_AAD_SIZE TLS_AAD_SPACE_SIZE /* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes. * * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3] * * The field 'length' is encoded in field 'b0' as '(length width - 1)'. * Hence b0 contains (3 - 1) = 2. */ #define TLS_AES_CCM_IV_B0_BYTE 2 #define TLS_SM4_CCM_IV_B0_BYTE 2 enum { TLS_BASE, TLS_SW, TLS_HW, TLS_HW_RECORD, TLS_NUM_CONFIG, }; struct tx_work { struct delayed_work work; struct sock *sk; }; struct tls_sw_context_tx { struct crypto_aead *aead_send; struct crypto_wait async_wait; struct tx_work tx_work; struct tls_rec *open_rec; struct list_head tx_list; atomic_t encrypt_pending; /* protect crypto_wait with encrypt_pending */ spinlock_t encrypt_compl_lock; int async_notify; u8 async_capable:1; #define BIT_TX_SCHEDULED 0 #define BIT_TX_CLOSING 1 unsigned long tx_bitmask; }; struct tls_strparser { struct sock *sk; u32 mark : 8; u32 stopped : 1; u32 copy_mode : 1; u32 mixed_decrypted : 1; u32 msg_ready : 1; struct strp_msg stm; struct sk_buff *anchor; struct work_struct work; }; struct tls_sw_context_rx { struct crypto_aead *aead_recv; struct crypto_wait async_wait; struct sk_buff_head rx_list; /* list of decrypted 'data' records */ void (*saved_data_ready)(struct sock *sk); u8 reader_present; u8 async_capable:1; u8 zc_capable:1; u8 reader_contended:1; struct tls_strparser strp; atomic_t decrypt_pending; /* protect crypto_wait with decrypt_pending*/ spinlock_t decrypt_compl_lock; struct sk_buff_head async_hold; struct wait_queue_head wq; }; struct tls_record_info { struct list_head list; u32 end_seq; int len; int num_frags; skb_frag_t frags[MAX_SKB_FRAGS]; }; struct tls_offload_context_tx { struct crypto_aead *aead_send; spinlock_t lock; /* protects records list */ struct list_head records_list; struct tls_record_info *open_record; struct tls_record_info *retransmit_hint; u64 hint_record_sn; u64 unacked_record_sn; struct scatterlist sg_tx_data[MAX_SKB_FRAGS]; void (*sk_destruct)(struct sock *sk); struct work_struct destruct_work; struct tls_context *ctx; u8 driver_state[] __aligned(8); /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ #define TLS_DRIVER_STATE_SIZE_TX 16 }; #define TLS_OFFLOAD_CONTEXT_SIZE_TX \ (sizeof(struct tls_offload_context_tx) + TLS_DRIVER_STATE_SIZE_TX) enum tls_context_flags { /* tls_device_down was called after the netdev went down, device state * was released, and kTLS works in software, even though rx_conf is * still TLS_HW (needed for transition). */ TLS_RX_DEV_DEGRADED = 0, /* Unlike RX where resync is driven entirely by the core in TX only * the driver knows when things went out of sync, so we need the flag * to be atomic. */ TLS_TX_SYNC_SCHED = 1, /* tls_dev_del was called for the RX side, device state was released, * but tls_ctx->netdev might still be kept, because TX-side driver * resources might not be released yet. Used to prevent the second * tls_dev_del call in tls_device_down if it happens simultaneously. */ TLS_RX_DEV_CLOSED = 2, }; struct cipher_context { char *iv; char *rec_seq; }; union tls_crypto_context { struct tls_crypto_info info; union { struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; struct tls12_crypto_info_aes_gcm_256 aes_gcm_256; struct tls12_crypto_info_chacha20_poly1305 chacha20_poly1305; struct tls12_crypto_info_sm4_gcm sm4_gcm; struct tls12_crypto_info_sm4_ccm sm4_ccm; }; }; struct tls_prot_info { u16 version; u16 cipher_type; u16 prepend_size; u16 tag_size; u16 overhead_size; u16 iv_size; u16 salt_size; u16 rec_seq_size; u16 aad_size; u16 tail_size; }; struct tls_context { /* read-only cache line */ struct tls_prot_info prot_info; u8 tx_conf:3; u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); void *priv_ctx_tx; void *priv_ctx_rx; struct net_device __rcu *netdev; /* rw cache line */ struct cipher_context tx; struct cipher_context rx; struct scatterlist *partially_sent_record; u16 partially_sent_offset; bool splicing_pages; bool pending_open_record_frags; struct mutex tx_lock; /* protects partially_sent_* fields and * per-type TX fields */ unsigned long flags; /* cache cold stuff */ struct proto *sk_proto; struct sock *sk; void (*sk_destruct)(struct sock *sk); union tls_crypto_context crypto_send; union tls_crypto_context crypto_recv; struct list_head list; refcount_t refcount; struct rcu_head rcu; }; enum tls_offload_ctx_dir { TLS_OFFLOAD_CTX_DIR_RX, TLS_OFFLOAD_CTX_DIR_TX, }; struct tlsdev_ops { int (*tls_dev_add)(struct net_device *netdev, struct sock *sk, enum tls_offload_ctx_dir direction, struct tls_crypto_info *crypto_info, u32 start_offload_tcp_sn); void (*tls_dev_del)(struct net_device *netdev, struct tls_context *ctx, enum tls_offload_ctx_dir direction); int (*tls_dev_resync)(struct net_device *netdev, struct sock *sk, u32 seq, u8 *rcd_sn, enum tls_offload_ctx_dir direction); }; enum tls_offload_sync_type { TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ = 0, TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT = 1, TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC = 2, }; #define TLS_DEVICE_RESYNC_NH_START_IVAL 2 #define TLS_DEVICE_RESYNC_NH_MAX_IVAL 128 #define TLS_DEVICE_RESYNC_ASYNC_LOGMAX 13 struct tls_offload_resync_async { atomic64_t req; u16 loglen; u16 rcd_delta; u32 log[TLS_DEVICE_RESYNC_ASYNC_LOGMAX]; }; struct tls_offload_context_rx { /* sw must be the first member of tls_offload_context_rx */ struct tls_sw_context_rx sw; enum tls_offload_sync_type resync_type; /* this member is set regardless of resync_type, to avoid branches */ u8 resync_nh_reset:1; /* CORE_NEXT_HINT-only member, but use the hole here */ u8 resync_nh_do_now:1; union { /* TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ */ struct { atomic64_t resync_req; }; /* TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT */ struct { u32 decrypted_failed; u32 decrypted_tgt; } resync_nh; /* TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC */ struct { struct tls_offload_resync_async *resync_async; }; }; u8 driver_state[] __aligned(8); /* The TLS layer reserves room for driver specific state * Currently the belief is that there is not enough * driver specific state to justify another layer of indirection */ #define TLS_DRIVER_STATE_SIZE_RX 8 }; #define TLS_OFFLOAD_CONTEXT_SIZE_RX \ (sizeof(struct tls_offload_context_rx) + TLS_DRIVER_STATE_SIZE_RX) struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context, u32 seq, u64 *p_record_sn); static inline bool tls_record_is_start_marker(struct tls_record_info *rec) { return rec->len == 0; } static inline u32 tls_record_start_seq(struct tls_record_info *rec) { return rec->end_seq - rec->len; } struct sk_buff * tls_validate_xmit_skb(struct sock *sk, struct net_device *dev, struct sk_buff *skb); struct sk_buff * tls_validate_xmit_skb_sw(struct sock *sk, struct net_device *dev, struct sk_buff *skb); static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb) { #ifdef CONFIG_TLS_DEVICE struct sock *sk = skb->sk; return sk && sk_fullsock(sk) && (smp_load_acquire(&sk->sk_validate_xmit_skb) == &tls_validate_xmit_skb); #else return false; #endif } static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); /* Use RCU on icsk_ulp_data only for sock diag code, * TLS data path doesn't need rcu_dereference(). */ return (__force void *)icsk->icsk_ulp_data; } static inline struct tls_sw_context_rx *tls_sw_ctx_rx( const struct tls_context *tls_ctx) { return (struct tls_sw_context_rx *)tls_ctx->priv_ctx_rx; } static inline struct tls_sw_context_tx *tls_sw_ctx_tx( const struct tls_context *tls_ctx) { return (struct tls_sw_context_tx *)tls_ctx->priv_ctx_tx; } static inline struct tls_offload_context_tx * tls_offload_ctx_tx(const struct tls_context *tls_ctx) { return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } static inline bool tls_sw_has_ctx_tx(const struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_tx(ctx); } static inline bool tls_sw_has_ctx_rx(const struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); if (!ctx) return false; return !!tls_sw_ctx_rx(ctx); } static inline struct tls_offload_context_rx * tls_offload_ctx_rx(const struct tls_context *tls_ctx) { return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; } static inline void *__tls_driver_ctx(struct tls_context *tls_ctx, enum tls_offload_ctx_dir direction) { if (direction == TLS_OFFLOAD_CTX_DIR_TX) return tls_offload_ctx_tx(tls_ctx)->driver_state; else return tls_offload_ctx_rx(tls_ctx)->driver_state; } static inline void * tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) { return __tls_driver_ctx(tls_get_ctx(sk), direction); } #define RESYNC_REQ BIT(0) #define RESYNC_REQ_ASYNC BIT(1) /* The TLS context is valid until sk_destruct is called */ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); } /* Log all TLS record header TCP sequences in [seq, seq+len] */ static inline void tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | ((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC); rx_ctx->resync_async->loglen = 0; rx_ctx->resync_async->rcd_delta = 0; } static inline void tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); } static inline void tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type) { struct tls_context *tls_ctx = tls_get_ctx(sk); tls_offload_ctx_rx(tls_ctx)->resync_type = type; } /* Driver's seq tracking has to be disabled until resync succeeded */ static inline bool tls_offload_tx_resync_pending(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); bool ret; ret = test_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags); smp_mb__after_atomic(); return ret; } struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); #ifdef CONFIG_TLS_DEVICE void tls_device_sk_destruct(struct sock *sk); void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq); static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk) { if (!sk_fullsock(sk) || smp_load_acquire(&sk->sk_destruct) != tls_device_sk_destruct) return false; return tls_get_ctx(sk)->rx_conf == TLS_HW; } #endif #endif /* _TLS_OFFLOAD_H */
8 8 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/proc/net.c * * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmission.com> * * proc net directory handling functions */ #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/module.h> #include <linux/bitops.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <linux/uidgid.h> #include <net/net_namespace.h> #include <linux/seq_file.h> #include "internal.h" static inline struct net *PDE_NET(struct proc_dir_entry *pde) { return pde->parent->data; } static struct net *get_proc_net(const struct inode *inode) { return maybe_get_net(PDE_NET(PDE(inode))); } static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; struct seq_net_private *p; struct net *net; WARN_ON_ONCE(state_size < sizeof(*p)); if (file->f_mode & FMODE_WRITE && !PDE(inode)->write) return -EACCES; net = get_proc_net(inode); if (!net) return -ENXIO; p = __seq_open_private(file, PDE(inode)->seq_ops, state_size); if (!p) { put_net(net); return -ENOMEM; } #ifdef CONFIG_NET_NS p->net = net; netns_tracker_alloc(net, &p->ns_tracker, GFP_KERNEL); #endif return 0; } static void seq_file_net_put_net(struct seq_file *seq) { #ifdef CONFIG_NET_NS struct seq_net_private *priv = seq->private; put_net_track(priv->net, &priv->ns_tracker); #else put_net(&init_net); #endif } static int seq_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; seq_file_net_put_net(seq); seq_release_private(ino, f); return 0; } static const struct proc_ops proc_net_seq_ops = { .proc_open = seq_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = seq_release_net, }; int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; p->net = get_net_track(current->nsproxy->net_ns, &p->ns_tracker, GFP_KERNEL); #endif return 0; } void bpf_iter_fini_seq_net(void *priv_data) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; put_net_track(p->net, &p->ns_tracker); #endif } struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data); /** * proc_create_net_data_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * series of elements and also provides for the file accepting writes that have * some arbitrary effect. * * The functions in the @ops table are used to iterate over items to be * presented and extract the readable content using the seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, proc_write_t write, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_seq_ops; p->seq_ops = ops; p->state_size = state_size; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_data_write); static int single_open_net(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); struct net *net; int err; net = get_proc_net(inode); if (!net) return -ENXIO; err = single_open(file, de->single_show, net); if (err) put_net(net); return err; } static int single_release_net(struct inode *ino, struct file *f) { struct seq_file *seq = f->private_data; put_net(seq->private); return single_release(ino, f); } static const struct proc_ops proc_net_single_ops = { .proc_open = single_open_net, .proc_read = seq_read, .proc_write = proc_simple_write, .proc_lseek = seq_lseek, .proc_release = single_release_net, }; struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single); /** * proc_create_net_single_write - Create a writable net_ns-specific proc file * @name: The name of the file. * @mode: The file's access mode. * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a * single element rather than a series and also provides for the file accepting * writes that have some arbitrary effect. * * The @show function is called to extract the readable content via the * seq_file interface. * * The @write function is called with the data copied into a kernel space * scratch buffer and has a NUL appended for convenience. The buffer may be * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), proc_write_t write, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; pde_force_lookup(p); p->proc_ops = &proc_net_single_ops; p->single_show = show; p->write = write; return proc_register(parent, p); } EXPORT_SYMBOL_GPL(proc_create_net_single_write); static struct net *get_proc_task_net(struct inode *dir) { struct task_struct *task; struct nsproxy *ns; struct net *net = NULL; rcu_read_lock(); task = pid_task(proc_pid(dir), PIDTYPE_PID); if (task != NULL) { task_lock(task); ns = task->nsproxy; if (ns != NULL) net = get_net(ns->net_ns); task_unlock(task); } rcu_read_unlock(); return net; } static struct dentry *proc_tgid_net_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct dentry *de; struct net *net; de = ERR_PTR(-ENOENT); net = get_proc_task_net(dir); if (net != NULL) { de = proc_lookup_de(dir, dentry, net->proc_net); put_net(net); } return de; } static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct net *net; net = get_proc_task_net(inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; put_net(net); } return 0; } const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) { int ret; struct net *net; ret = -EINVAL; net = get_proc_task_net(file_inode(file)); if (net != NULL) { ret = proc_readdir_de(file, ctx, net->proc_net); put_net(net); } return ret; } const struct file_operations proc_net_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = proc_tgid_net_readdir, }; static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; kuid_t uid; kgid_t gid; int err; /* * This PDE acts only as an anchor for /proc/${pid}/net hierarchy. * Corresponding inode (PDE(inode) == net->proc_net) is never * instantiated therefore blanket zeroing is fine. * net->proc_net_stat inode is instantiated normally. */ err = -ENOMEM; netd = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!netd) goto out; netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; netd->parent = &proc_root; netd->name = netd->inline_name; memcpy(netd->name, "net", 4); uid = make_kuid(net->user_ns, 0); if (!uid_valid(uid)) uid = netd->uid; gid = make_kgid(net->user_ns, 0); if (!gid_valid(gid)) gid = netd->gid; proc_set_user(netd, uid, gid); /* Seed dentry revalidation for /proc/${pid}/net */ pde_force_lookup(netd); err = -EEXIST; net_statd = proc_net_mkdir(net, "stat", netd); if (!net_statd) goto free_net; net->proc_net = netd; net->proc_net_stat = net_statd; return 0; free_net: pde_free(netd); out: return err; } static __net_exit void proc_net_ns_exit(struct net *net) { remove_proc_entry("stat", net->proc_net); pde_free(net->proc_net); } static struct pernet_operations __net_initdata proc_net_ns_ops = { .init = proc_net_ns_init, .exit = proc_net_ns_exit, }; int __init proc_net_init(void) { proc_symlink("net", NULL, "self/net"); return register_pernet_subsys(&proc_net_ns_ops); }
2 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 // SPDX-License-Identifier: GPL-2.0 /* Watch queue and general notification mechanism, built on pipes * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/core-api/watch_queue.rst */ #define pr_fmt(fmt) "watchq: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/miscdevice.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/poll.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/file.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/sched/signal.h> #include <linux/watch_queue.h> #include <linux/pipe_fs_i.h> MODULE_DESCRIPTION("Watch queue"); MODULE_AUTHOR("Red Hat, Inc."); #define WATCH_QUEUE_NOTE_SIZE 128 #define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE) /* * This must be called under the RCU read-lock, which makes * sure that the wqueue still exists. It can then take the lock, * and check that the wqueue hasn't been destroyed, which in * turn makes sure that the notification pipe still exists. */ static inline bool lock_wqueue(struct watch_queue *wqueue) { spin_lock_bh(&wqueue->lock); if (unlikely(!wqueue->pipe)) { spin_unlock_bh(&wqueue->lock); return false; } return true; } static inline void unlock_wqueue(struct watch_queue *wqueue) { spin_unlock_bh(&wqueue->lock); } static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct watch_queue *wqueue = (struct watch_queue *)buf->private; struct page *page; unsigned int bit; /* We need to work out which note within the page this refers to, but * the note might have been maximum size, so merely ANDing the offset * off doesn't work. OTOH, the note must've been more than zero size. */ bit = buf->offset + buf->len; if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0) bit -= WATCH_QUEUE_NOTE_SIZE; bit /= WATCH_QUEUE_NOTE_SIZE; page = buf->page; bit += page->index; set_bit(bit, wqueue->notes_bitmap); generic_pipe_buf_release(pipe, buf); } // No try_steal function => no stealing #define watch_queue_pipe_buf_try_steal NULL /* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = { .release = watch_queue_pipe_buf_release, .try_steal = watch_queue_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* * Post a notification to a watch queue. * * Must be called with the RCU lock for reading, and the * watch_queue lock held, which guarantees that the pipe * hasn't been released. */ static bool post_one_notification(struct watch_queue *wqueue, struct watch_notification *n) { void *p; struct pipe_inode_info *pipe = wqueue->pipe; struct pipe_buffer *buf; struct page *page; unsigned int head, tail, mask, note, offset, len; bool done = false; spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; if (pipe_full(head, tail, pipe->ring_size)) goto lost; note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes); if (note >= wqueue->nr_notes) goto lost; page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE]; offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; p = kmap_atomic(page); memcpy(p + offset, n, len); kunmap_atomic(p); buf = &pipe->bufs[head & mask]; buf->page = page; buf->private = (unsigned long)wqueue; buf->ops = &watch_queue_pipe_buf_ops; buf->offset = offset; buf->len = len; buf->flags = PIPE_BUF_FLAG_WHOLE; smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) { spin_unlock_irq(&pipe->rd_wait.lock); BUG(); } wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); done = true; out: spin_unlock_irq(&pipe->rd_wait.lock); if (done) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); return done; lost: buf = &pipe->bufs[(head - 1) & mask]; buf->flags |= PIPE_BUF_FLAG_LOSS; goto out; } /* * Apply filter rules to a notification. */ static bool filter_watch_notification(const struct watch_filter *wf, const struct watch_notification *n) { const struct watch_type_filter *wt; unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8; unsigned int st_index = n->subtype / st_bits; unsigned int st_bit = 1U << (n->subtype % st_bits); int i; if (!test_bit(n->type, wf->type_filter)) return false; for (i = 0; i < wf->nr_filters; i++) { wt = &wf->filters[i]; if (n->type == wt->type && (wt->subtype_filter[st_index] & st_bit) && (n->info & wt->info_mask) == wt->info_filter) return true; } return false; /* If there is a filter, the default is to reject. */ } /** * __post_watch_notification - Post an event notification * @wlist: The watch list to post the event to. * @n: The notification record to post. * @cred: The creds of the process that triggered the notification. * @id: The ID to match on the watch. * * Post a notification of an event into a set of watch queues and let the users * know. * * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and * should be in units of sizeof(*n). */ void __post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { const struct watch_filter *wf; struct watch_queue *wqueue; struct watch *watch; if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) { WARN_ON(1); return; } rcu_read_lock(); hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) { if (watch->id != id) continue; n->info &= ~WATCH_INFO_ID; n->info |= watch->info_id; wqueue = rcu_dereference(watch->queue); wf = rcu_dereference(wqueue->filter); if (wf && !filter_watch_notification(wf, n)) continue; if (security_post_notification(watch->cred, cred, n) < 0) continue; if (lock_wqueue(wqueue)) { post_one_notification(wqueue, n); unlock_wqueue(wqueue); } } rcu_read_unlock(); } EXPORT_SYMBOL(__post_watch_notification); /* * Allocate sufficient pages to preallocation for the requested number of * notifications. */ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) { struct watch_queue *wqueue = pipe->watch_queue; struct page **pages; unsigned long *bitmap; unsigned long user_bufs; int ret, i, nr_pages; if (!wqueue) return -ENODEV; if (wqueue->notes) return -EBUSY; if (nr_notes < 1 || nr_notes > 512) /* TODO: choose a better hard limit */ return -EINVAL; nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); if (nr_pages > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto error; } nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); if (ret < 0) goto error; ret = -ENOMEM; pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); if (!pages) goto error; for (i = 0; i < nr_pages; i++) { pages[i] = alloc_page(GFP_KERNEL); if (!pages[i]) goto error_p; pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; } bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); if (!bitmap) goto error_p; bitmap_fill(bitmap, nr_notes); wqueue->notes = pages; wqueue->notes_bitmap = bitmap; wqueue->nr_pages = nr_pages; wqueue->nr_notes = nr_notes; return 0; error_p: while (--i >= 0) __free_page(pages[i]); kfree(pages); error: (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); return ret; } /* * Set the filter on a watch queue. */ long watch_queue_set_filter(struct pipe_inode_info *pipe, struct watch_notification_filter __user *_filter) { struct watch_notification_type_filter *tf; struct watch_notification_filter filter; struct watch_type_filter *q; struct watch_filter *wfilter; struct watch_queue *wqueue = pipe->watch_queue; int ret, nr_filter = 0, i; if (!wqueue) return -ENODEV; if (!_filter) { /* Remove the old filter */ wfilter = NULL; goto set; } /* Grab the user's filter specification */ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0) return -EFAULT; if (filter.nr_filters == 0 || filter.nr_filters > 16 || filter.__reserved != 0) return -EINVAL; tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf)); if (IS_ERR(tf)) return PTR_ERR(tf); ret = -EINVAL; for (i = 0; i < filter.nr_filters; i++) { if ((tf[i].info_filter & ~tf[i].info_mask) || tf[i].info_mask & WATCH_INFO_LENGTH) goto err_filter; /* Ignore any unknown types */ if (tf[i].type >= WATCH_TYPE__NR) continue; nr_filter++; } /* Now we need to build the internal filter from only the relevant * user-specified filters. */ ret = -ENOMEM; wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL); if (!wfilter) goto err_filter; wfilter->nr_filters = nr_filter; q = wfilter->filters; for (i = 0; i < filter.nr_filters; i++) { if (tf[i].type >= WATCH_TYPE__NR) continue; q->type = tf[i].type; q->info_filter = tf[i].info_filter; q->info_mask = tf[i].info_mask; q->subtype_filter[0] = tf[i].subtype_filter[0]; __set_bit(q->type, wfilter->type_filter); q++; } kfree(tf); set: pipe_lock(pipe); wfilter = rcu_replace_pointer(wqueue->filter, wfilter, lockdep_is_held(&pipe->mutex)); pipe_unlock(pipe); if (wfilter) kfree_rcu(wfilter, rcu); return 0; err_filter: kfree(tf); return ret; } static void __put_watch_queue(struct kref *kref) { struct watch_queue *wqueue = container_of(kref, struct watch_queue, usage); struct watch_filter *wfilter; int i; for (i = 0; i < wqueue->nr_pages; i++) __free_page(wqueue->notes[i]); kfree(wqueue->notes); bitmap_free(wqueue->notes_bitmap); wfilter = rcu_access_pointer(wqueue->filter); if (wfilter) kfree_rcu(wfilter, rcu); kfree_rcu(wqueue, rcu); } /** * put_watch_queue - Dispose of a ref on a watchqueue. * @wqueue: The watch queue to unref. */ void put_watch_queue(struct watch_queue *wqueue) { kref_put(&wqueue->usage, __put_watch_queue); } EXPORT_SYMBOL(put_watch_queue); static void free_watch(struct rcu_head *rcu) { struct watch *watch = container_of(rcu, struct watch, rcu); put_watch_queue(rcu_access_pointer(watch->queue)); atomic_dec(&watch->cred->user->nr_watches); put_cred(watch->cred); kfree(watch); } static void __put_watch(struct kref *kref) { struct watch *watch = container_of(kref, struct watch, usage); call_rcu(&watch->rcu, free_watch); } /* * Discard a watch. */ static void put_watch(struct watch *watch) { kref_put(&watch->usage, __put_watch); } /** * init_watch - Initialise a watch * @watch: The watch to initialise. * @wqueue: The queue to assign. * * Initialise a watch and set the watch queue. */ void init_watch(struct watch *watch, struct watch_queue *wqueue) { kref_init(&watch->usage); INIT_HLIST_NODE(&watch->list_node); INIT_HLIST_NODE(&watch->queue_node); rcu_assign_pointer(watch->queue, wqueue); } static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue) { const struct cred *cred; struct watch *w; hlist_for_each_entry(w, &wlist->watchers, list_node) { struct watch_queue *wq = rcu_access_pointer(w->queue); if (wqueue == wq && watch->id == w->id) return -EBUSY; } cred = current_cred(); if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) { atomic_dec(&cred->user->nr_watches); return -EAGAIN; } watch->cred = get_cred(cred); rcu_assign_pointer(watch->watch_list, wlist); kref_get(&wqueue->usage); kref_get(&watch->usage); hlist_add_head(&watch->queue_node, &wqueue->watches); hlist_add_head_rcu(&watch->list_node, &wlist->watchers); return 0; } /** * add_watch_to_object - Add a watch on an object to a watch list * @watch: The watch to add * @wlist: The watch list to add to * * @watch->queue must have been set to point to the queue to post notifications * to and the watch list of the object to be watched. @watch->cred must also * have been set to the appropriate credentials and a ref taken on them. * * The caller must pin the queue and the list both and must hold the list * locked against racing watch additions/removals. */ int add_watch_to_object(struct watch *watch, struct watch_list *wlist) { struct watch_queue *wqueue; int ret = -ENOENT; rcu_read_lock(); wqueue = rcu_access_pointer(watch->queue); if (lock_wqueue(wqueue)) { spin_lock(&wlist->lock); ret = add_one_watch(watch, wlist, wqueue); spin_unlock(&wlist->lock); unlock_wqueue(wqueue); } rcu_read_unlock(); return ret; } EXPORT_SYMBOL(add_watch_to_object); /** * remove_watch_from_object - Remove a watch or all watches from an object. * @wlist: The watch list to remove from * @wq: The watch queue of interest (ignored if @all is true) * @id: The ID of the watch to remove (ignored if @all is true) * @all: True to remove all objects * * Remove a specific watch or all watches from an object. A notification is * sent to the watcher to tell them that this happened. */ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, u64 id, bool all) { struct watch_notification_removal n; struct watch_queue *wqueue; struct watch *watch; int ret = -EBADSLT; rcu_read_lock(); again: spin_lock(&wlist->lock); hlist_for_each_entry(watch, &wlist->watchers, list_node) { if (all || (watch->id == id && rcu_access_pointer(watch->queue) == wq)) goto found; } spin_unlock(&wlist->lock); goto out; found: ret = 0; hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); spin_unlock(&wlist->lock); /* We now own the reference on watch that used to belong to wlist. */ n.watch.type = WATCH_TYPE_META; n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION; n.watch.info = watch->info_id | watch_sizeof(n.watch); n.id = id; if (id != 0) n.watch.info = watch->info_id | watch_sizeof(n); wqueue = rcu_dereference(watch->queue); if (lock_wqueue(wqueue)) { post_one_notification(wqueue, &n.watch); if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node); put_watch(watch); } unlock_wqueue(wqueue); } if (wlist->release_watch) { void (*release_watch)(struct watch *); release_watch = wlist->release_watch; rcu_read_unlock(); (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); if (all && !hlist_empty(&wlist->watchers)) goto again; out: rcu_read_unlock(); return ret; } EXPORT_SYMBOL(remove_watch_from_object); /* * Remove all the watches that are contributory to a queue. This has the * potential to race with removal of the watches by the destruction of the * objects being watched or with the distribution of notifications. */ void watch_queue_clear(struct watch_queue *wqueue) { struct watch_list *wlist; struct watch *watch; bool release; rcu_read_lock(); spin_lock_bh(&wqueue->lock); /* * This pipe can be freed by callers like free_pipe_info(). * Removing this reference also prevents new notifications. */ wqueue->pipe = NULL; while (!hlist_empty(&wqueue->watches)) { watch = hlist_entry(wqueue->watches.first, struct watch, queue_node); hlist_del_init_rcu(&watch->queue_node); /* We now own a ref on the watch. */ spin_unlock_bh(&wqueue->lock); /* We can't do the next bit under the queue lock as we need to * get the list lock - which would cause a deadlock if someone * was removing from the opposite direction at the same time or * posting a notification. */ wlist = rcu_dereference(watch->watch_list); if (wlist) { void (*release_watch)(struct watch *); spin_lock(&wlist->lock); release = !hlist_unhashed(&watch->list_node); if (release) { hlist_del_init_rcu(&watch->list_node); rcu_assign_pointer(watch->watch_list, NULL); /* We now own a second ref on the watch. */ } release_watch = wlist->release_watch; spin_unlock(&wlist->lock); if (release) { if (release_watch) { rcu_read_unlock(); /* This might need to call dput(), so * we have to drop all the locks. */ (*release_watch)(watch); rcu_read_lock(); } put_watch(watch); } } put_watch(watch); spin_lock_bh(&wqueue->lock); } spin_unlock_bh(&wqueue->lock); rcu_read_unlock(); } /** * get_watch_queue - Get a watch queue from its file descriptor. * @fd: The fd to query. */ struct watch_queue *get_watch_queue(int fd) { struct pipe_inode_info *pipe; struct watch_queue *wqueue = ERR_PTR(-EINVAL); struct fd f; f = fdget(fd); if (f.file) { pipe = get_pipe_info(f.file, false); if (pipe && pipe->watch_queue) { wqueue = pipe->watch_queue; kref_get(&wqueue->usage); } fdput(f); } return wqueue; } EXPORT_SYMBOL(get_watch_queue); /* * Initialise a watch queue */ int watch_queue_init(struct pipe_inode_info *pipe) { struct watch_queue *wqueue; wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL); if (!wqueue) return -ENOMEM; wqueue->pipe = pipe; kref_init(&wqueue->usage); spin_lock_init(&wqueue->lock); INIT_HLIST_HEAD(&wqueue->watches); pipe->watch_queue = wqueue; return 0; }
271 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_BARRIER_H #define _ASM_X86_BARRIER_H #include <asm/alternative.h> #include <asm/nops.h> /* * Force strict CPU ordering. * And yes, this might be required on UP too when we're talking * to devices. */ #ifdef CONFIG_X86_32 #define mb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "mfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #define rmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "lfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #define wmb() asm volatile(ALTERNATIVE("lock; addl $0,-4(%%esp)", "sfence", \ X86_FEATURE_XMM2) ::: "memory", "cc") #else #define __mb() asm volatile("mfence":::"memory") #define __rmb() asm volatile("lfence":::"memory") #define __wmb() asm volatile("sfence" ::: "memory") #endif /** * array_index_mask_nospec() - generate a mask that is ~0UL when the * bounds check succeeds and 0 otherwise * @index: array element index * @size: number of elements in array * * Returns: * 0 - (index < size) */ static inline unsigned long array_index_mask_nospec(unsigned long index, unsigned long size) { unsigned long mask; asm volatile ("cmp %1,%2; sbb %0,%0;" :"=r" (mask) :"g"(size),"r" (index) :"cc"); return mask; } /* Override the default implementation from linux/nospec.h. */ #define array_index_mask_nospec array_index_mask_nospec /* Prevent speculative execution past this barrier. */ #define barrier_nospec() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) #define __dma_rmb() barrier() #define __dma_wmb() barrier() #define __smp_mb() asm volatile("lock; addl $0,-4(%%" _ASM_SP ")" ::: "memory", "cc") #define __smp_rmb() dma_rmb() #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) #define __smp_load_acquire(p) \ ({ \ typeof(*p) ___p1 = READ_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ barrier(); \ ___p1; \ }) /* Atomic operations are already serializing on x86 */ #define __smp_mb__before_atomic() do { } while (0) #define __smp_mb__after_atomic() do { } while (0) #include <asm-generic/barrier.h> /* * Make previous memory operations globally visible before * a WRMSR. * * MFENCE makes writes visible, but only affects load/store * instructions. WRMSR is unfortunately not a load/store * instruction and is unaffected by MFENCE. The LFENCE ensures * that the WRMSR is not reordered. * * Most WRMSRs are full serializing instructions themselves and * do not require this barrier. This is only required for the * IA32_TSC_DEADLINE and X2APIC MSRs. */ static inline void weak_wrmsr_fence(void) { asm volatile("mfence; lfence" : : : "memory"); } #endif /* _ASM_X86_BARRIER_H */
5 5 5 5 5 5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; static inline void __super_lock(struct super_block *sb, bool excl) { if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); } static inline void super_unlock(struct super_block *sb, bool excl) { if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); } static inline void __super_lock_excl(struct super_block *sb) { __super_lock(sb, true); } static inline void super_unlock_excl(struct super_block *sb) { super_unlock(sb, true); } static inline void super_unlock_shared(struct super_block *sb) { super_unlock(sb, false); } static inline bool wait_born(struct super_block *sb) { unsigned int flags; /* * Pairs with smp_store_release() in super_wake() and ensures * that we see SB_BORN or SB_DYING after we're woken. */ flags = smp_load_acquire(&sb->s_flags); return flags & (SB_BORN | SB_DYING); } /** * super_lock - wait for superblock to become ready and lock it * @sb: superblock to wait for * @excl: whether exclusive access is required * * If the superblock has neither passed through vfs_get_tree() or * generic_shutdown_super() yet wait for it to happen. Either superblock * creation will succeed and SB_BORN is set by vfs_get_tree() or we're * woken and we'll see SB_DYING. * * The caller must have acquired a temporary reference on @sb->s_count. * * Return: This returns true if SB_BORN was set, false if SB_DYING was * set. The function acquires s_umount and returns with it held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { lockdep_assert_not_held(&sb->s_umount); relock: __super_lock(sb, excl); /* * Has gone through generic_shutdown_super() in the meantime. * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ if (sb->s_flags & SB_DYING) return false; /* Has called ->get_tree() successfully. */ if (sb->s_flags & SB_BORN) return true; super_unlock(sb, excl); /* wait until the superblock is ready or dying */ wait_var_event(&sb->s_flags, wait_born(sb)); /* * Neither SB_BORN nor SB_DYING are ever unset so we never loop. * Just reacquire @sb->s_umount for the caller. */ goto relock; } /* wait and acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } /* wait and acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); } /* wake waiters */ #define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) static void super_wake(struct super_block *sb, unsigned int flag) { WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); /* * Pairs with smp_load_acquire() in super_lock() to make sure * all initializations in the superblock are seen by the user * seeing SB_BORN sent. */ smp_store_release(&sb->s_flags, sb->s_flags | flag); /* * Pairs with the barrier in prepare_to_wait_event() to make sure * ___wait_var_event() either sees SB_BORN set or * waitqueue_active() check in wake_up_var() sees the waiter. */ smp_mb(); wake_up_var(&sb->s_flags); } /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = container_of(shrink, struct super_block, s_shrink); /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends.. */ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } super_unlock_shared(sb); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = container_of(shrink, struct super_block, s_shrink); /* * We don't call super_trylock_shared() here as it is a scalability * bottleneck, so we're exposed to partial setup state. The shrinker * rwsem does not protect filesystem operations backing * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can * change between super_cache_count and super_cache_scan, so we really * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. super_trylock_shared() uses a SB_BORN check * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); int i; for (i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); free_prealloced_shrinker(&s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. alloc_super() * returns a pointer new superblock or %NULL if allocation had failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distrinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) goto fail; if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees superblock if there's no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } static void kill_super_notify(struct super_block *sb) { lockdep_assert_not_held(&sb->s_umount); /* already notified earlier */ if (sb->s_flags & SB_DEAD) return; /* * Remove it from @fs_supers so it isn't found by new * sget{_fc}() walkers anymore. Any concurrent mounter still * managing to grab a temporary reference is guaranteed to * already see SB_DYING and will wait until we notify them about * SB_DEAD. */ spin_lock(&sb_lock); hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); /* * Let concurrent mounts know that this thing is really dead. * We don't need @sb->s_umount here as every concurrent caller * will see SB_DYING and either discard the superblock or wait * for SB_DEAD. */ super_wake(sb, SB_DEAD); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there is no other active references left. In that case we * tell fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released. */ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { unregister_shrinker(&s->s_shrink); fs->kill_sb(s); kill_super_notify(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { super_unlock_excl(s); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { __super_lock_excl(s); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference * @s: reference we are trying to make active * * Tries to acquire an active reference. grab_super() is used when we * had just found a superblock in super_blocks or fs_type->fs_supers * and want to turn it into a full-blown active reference. grab_super() * is called with sb_lock held and drops it. Returns 1 in case of * success, 0 if we had failed (superblock contents was already dead or * dying when grab_super() had been called). Note that this is only * called for superblocks not in rundown mode (== ones still on ->fs_supers * of their type), so increment of ->s_count is OK here. */ static int grab_super(struct super_block *s) __releases(sb_lock) { bool born; s->s_count++; spin_unlock(&sb_lock); born = super_lock_excl(s); if (born && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } super_unlock_excl(s); put_super(s); return 0; } static inline bool wait_dead(struct super_block *sb) { unsigned int flags; /* * Pairs with memory barrier in super_wake() and ensures * that we see SB_DEAD after we're woken. */ flags = smp_load_acquire(&sb->s_flags); return flags & SB_DEAD; } /** * grab_super_dead - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for * an active reference. This is used in sget{_fc}() to wait for a * superblock to either become SB_BORN or for it to pass through * sb->kill() and be marked as SB_DEAD. * * Return: This returns true if an active reference could be acquired, * false if not. */ static bool grab_super_dead(struct super_block *sb) { sb->s_count++; if (grab_super(sb)) { put_super(sb); lockdep_assert_held(&sb->s_umount); return true; } wait_var_event(&sb->s_flags, wait_dead(sb)); lockdep_assert_not_held(&sb->s_umount); put_super(sb); return false; } /* * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!(sb->s_flags & SB_DYING) && sb->s_root && (sb->s_flags & SB_BORN)) return true; super_unlock_shared(sb); } return false; } /** * retire_super - prevents superblock from being reused * @sb: superblock to retire * * The function marks superblock to be ignored in superblock test, which * prevents it from being reused for any new mounts. If the superblock has * a private bdi, it also unregisters it, but doesn't reduce the refcount * of the superblock to prevent potential races. The refcount is reduced * by generic_shutdown_super(). The function can not be called * concurrently with generic_shutdown_super(). It is safe to call the * function multiple times, subsequent calls have no effect. * * The marker will affect the re-use only for block-device-based * superblocks. Other superblocks will still get marked if this function * is used, but that will not affect their reusability. */ void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. * * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { /* * Adding a proper bailout path here would be hard, but * we can at least make it more likely that a later * iput_final() or such crashes cleanly. */ struct inode *inode; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { inode->i_op = VFS_PTR_POISON; inode->i_sb = VFS_PTR_POISON; inode->i_mapping = VFS_PTR_POISON; } spin_unlock(&sb->s_inode_list_lock); } } /* * Broadcast to everyone that grabbed a temporary reference to this * superblock before we removed it from @fs_supers that the superblock * is dying. Every walker of @fs_supers outside of sget{_fc}() will now * discard this superblock and treat it as dead. * * We leave the superblock on @fs_supers so it can be found by * sget{_fc}() until we passed sb->kill_sb(). */ super_wake(sb, SB_DYING); super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Create a new superblock or find an existing one. * * The @test callback is used to find a matching existing superblock. * Whether or not the requested parameters in @fc are taken into account * is specific to the @test callback that is used. They may even be * completely ignored. * * If an extant superblock is matched, it will be returned unless: * * (1) the namespace the filesystem context @fc and the extant * superblock's namespace differ * * (2) the filesystem context @fc has requested that reusing an extant * superblock is not allowed * * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be * set and the @set callback will be invoked), the superblock will be * published and it will be returned in a partially constructed state * with SB_BORN and SB_ACTIVE as yet unset. * * Return: On success, an extant or newly created superblock is * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); /* * Make the superblock visible on @super_blocks and @fs_supers. * It's in a nascent state and users should wait on SB_BORN or * SB_DYING to be set. */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); register_shrinker_prepared(&s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); if (fc->exclusive) warnfc(fc, "reusing existing filesystem not allowed"); else warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; /* We don't yet pass the user namespace of the parent * mount through to here so always use &init_user_ns * until that changes. */ if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strscpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); register_shrinker_prepared(&s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { super_unlock_shared(sb); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); static void __iterate_supers(void (*f)(struct super_block *)) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { /* Pairs with memory marrier in super_wake(). */ if (smp_load_acquire(&sb->s_flags) & SB_DYING) continue; sb->s_count++; spin_unlock(&sb_lock); f(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { bool born; sb->s_count++; spin_unlock(&sb_lock); born = super_lock_shared(sb); if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool born; sb->s_count++; spin_unlock(&sb_lock); born = super_lock_shared(sb); if (born && sb->s_root) f(sb, arg); super_unlock_shared(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. Returns the superblock with an active * reference or %NULL if none was found. */ struct super_block *get_active_super(struct block_device *bdev) { struct super_block *sb; if (!bdev) return NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdev == bdev) { if (!grab_super(sb)) return NULL; super_unlock_excl(sb); return sb; } } spin_unlock(&sb_lock); return NULL; } struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { bool born; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ born = super_lock(sb, excl); if (born && sb->s_root) return sb; super_unlock(sb, excl); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); break; } } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. */ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { super_unlock_excl(sb); group_pin_kill(&sb->s_pins); __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb_start_ro_state_change(sb); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * Protect filesystem's reconfigure code from writes from * userspace until reconfigure finishes. */ sb_start_ro_state_change(sb); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); sb_end_ro_state_change(sb); /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. */ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb_end_ro_state_change(sb); return retval; } static void do_emergency_remount_callback(struct super_block *sb) { bool born = super_lock_excl(sb); if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb) { bool born = super_lock_excl(sb); if (born && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); } else { super_unlock_excl(sb); } } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); kill_super_notify(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { struct super_block *sb; int err; sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; } fc->root = dget(sb->s_root); return 0; error: deactivate_locked_super(sb); return err; } int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); static int set_bdev_super(struct super_block *s, void *data) { s->s_dev = *(dev_t *)data; return 0; } static int super_s_dev_set(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int super_s_dev_test(struct super_block *s, struct fs_context *fc) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)fc->sget_key; } /** * sget_dev - Find or create a superblock by device number * @fc: Filesystem context. * @dev: device number * * Find or create a superblock using the provided device number that * will be stored in fc->sget_key. * * If an extant superblock is matched, then that will be returned with * an elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will * be set). The superblock will be published and it will be returned in * a partially constructed state with SB_BORN and SB_ACTIVE as yet * unset. * * Return: an existing or newly created superblock on success, an error * pointer on failure. */ struct super_block *sget_dev(struct fs_context *fc, dev_t dev) { fc->sget_key = &dev; return sget_fc(fc, super_s_dev_test, super_s_dev_set); } EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* * Lock a super block that the callers holds a reference to. * * The caller needs to ensure that the super_block isn't being freed while * calling this function, e.g. by holding a lock over the call to this function * and the place that clears the pointer to the superblock used by this function * before freeing the superblock. */ static bool super_lock_shared_active(struct super_block *sb) { bool born = super_lock_shared(sb); if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock_shared(sb); return false; } return true; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb = bdev->bd_holder; /* bd_holder_lock ensures that the sb isn't freed */ lockdep_assert_held(&bdev->bd_holder_lock); if (!super_lock_shared_active(sb)) return; if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); super_unlock_shared(sb); } static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb = bdev->bd_holder; lockdep_assert_held(&bdev->bd_holder_lock); if (!super_lock_shared_active(sb)) return; sync_filesystem(sb); super_unlock_shared(sb); } const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, }; EXPORT_SYMBOL_GPL(fs_holder_ops); int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); struct block_device *bdev; bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); if (IS_ERR(bdev)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev); } /* * This really should be in blkdev_get_by_dev, but right now can't due * to legacy issues that require us to allow opening a block device node * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, sb); return -EACCES; } /* * Until SB_BORN flag is set, there can be no active superblock * references and thus no filesystem freezing. get_active_super() will * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. * * It is enough to check bdev was not frozen before we set s_bdev. */ mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); blkdev_put(bdev, sb); return -EBUSY; } spin_lock(&sb_lock); sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; } EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { struct super_block *s; int error = 0; dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); error = lookup_bdev(fc->source, &dev); if (error) { errorf(fc, "%s: Can't lookup blockdev", fc->source); return error; } fc->sb_flags |= SB_NOSEC; s = sget_dev(fc, dev); if (IS_ERR(s)) return PTR_ERR(s); if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); return -EBUSY; } } else { /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ super_unlock_excl(s); error = setup_bdev_super(s, fc->sb_flags, fc); __super_lock_excl(s); if (!error) error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; dev_t dev; error = lookup_bdev(dev_name, &dev); if (error) return ERR_PTR(error); flags |= SB_NOSEC; s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } } else { /* * We drop s_umount here because we need to open the bdev and * bdev->open_mutex ranks above s_umount (blkdev_put() -> * bdev_mark_dead()). It is safe because we have active sb * reference and SB_BORN is not set yet. */ super_unlock_excl(s); error = setup_bdev_super(s, flags, NULL); __super_lock_excl(s); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; } return dget(s->s_root); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); blkdev_put(bdev, sb); } } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); int reconfigure_single(struct super_block *s, int flags, void *data) { struct fs_context *fc; int ret; /* The caller really need to be passing fc down into mount_single(), * then a chunk of this can be removed. [Bollocks -- AV] * Better yet, reconfiguration shouldn't happen, but rather the second * mount should be rejected if the parameters are not compatible. */ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); ret = parse_monolithic_mount_data(fc, data); if (ret < 0) goto out; ret = reconfigure_super(fc); out: put_fs_context(fc); return ret; } static int compare_single(struct super_block *s, void *p) { return 1; } struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (!error) s->s_flags |= SB_ACTIVE; } else { error = reconfigure_single(s, flags, data); } if (unlikely(error)) { deactivate_locked_super(s); return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root\n", fc->fs_type->name); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * super_wake() contains a memory barrier which also care of * ordering for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is * the superblock structure contents that we just set up, not * the SB_BORN flag. */ super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } static int wait_for_partially_frozen(struct super_block *sb) { int ret = 0; do { unsigned short old = sb->s_writers.frozen; up_write(&sb->s_umount); ret = wait_var_event_killable(&sb->s_writers.frozen, sb->s_writers.frozen != old); down_write(&sb->s_umount); } while (ret == 0 && sb->s_writers.frozen != SB_UNFROZEN && sb->s_writers.frozen != SB_FREEZE_COMPLETE); return ret; } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or * multiple userspace freezes in effect at any given time, the kernel and * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; atomic_inc(&sb->s_active); if (!super_lock_excl(sb)) WARN(1, "Dying superblock while freezing!"); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (sb->s_writers.freeze_holders & who) { deactivate_locked_super(sb); return -EBUSY; } WARN_ON(sb->s_writers.freeze_holders == 0); /* * Someone else already holds this type of freeze; share the * freeze and assign the active ref to the freeze. */ sb->s_writers.freeze_holders |= who; super_unlock_excl(sb); return 0; } if (sb->s_writers.frozen != SB_UNFROZEN) { ret = wait_for_partially_frozen(sb); if (ret) { deactivate_locked_super(sb); return ret; } goto retry; } if (!(sb->s_flags & SB_BORN)) { super_unlock_excl(sb); return 0; /* sic - it's "nothing to do" */ } if (sb_rdonly(sb)) { /* Nothing to do really... */ sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); if (!super_lock_excl(sb)) WARN(1, "Dying superblock while freezing!"); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); /* * Undoes the effect of a freeze_super_locked call. If the filesystem is * frozen both by userspace and the kernel, a thaw call from either source * removes that state without releasing the other state or unlocking the * filesystem. */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error; if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { if (!(sb->s_writers.freeze_holders & who)) { super_unlock_excl(sb); return -EINVAL; } /* * Freeze is shared with someone else. Release our hold and * drop the active ref that freeze_super assigned to the * freezer. */ if (sb->s_writers.freeze_holders & ~who) { sb->s_writers.freeze_holders &= ~who; deactivate_locked_super(sb); return 0; } } else { super_unlock_excl(sb); return -EINVAL; } if (sb_rdonly(sb)) { sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); goto out; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); super_unlock_excl(sb); return error; } } sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out: deactivate_locked_super(sb); return 0; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. * * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { if (!super_lock_excl(sb)) WARN(1, "Dying superblock while thawing!"); return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); /* * Create workqueue for deferred direct IO completions. We allocate the * workqueue when it's first needed. This avoids creating workqueue for * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id); if (!wq) return -ENOMEM; /* * This has to be atomic as more DIOs can race to create the workqueue */ old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); /* Someone created workqueue before us? Free ours... */ if (old) destroy_workqueue(wq); return 0; }
1386 1760 1510 1478 982 793 32 17 1766 1765 1481 1480 210 263 273 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NETFILTER_H #define __LINUX_NETFILTER_H #include <linux/init.h> #include <linux/skbuff.h> #include <linux/net.h> #include <linux/if.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/static_key.h> #include <linux/module.h> #include <linux/netfilter_defs.h> #include <linux/netdevice.h> #include <linux/sockptr.h> #include <net/net_namespace.h> static inline int NF_DROP_GETERR(int verdict) { return -(verdict >> NF_VERDICT_QBITS); } static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return a1->all[0] == a2->all[0] && a1->all[1] == a2->all[1] && a1->all[2] == a2->all[2] && a1->all[3] == a2->all[3]; #endif } static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, union nf_inet_addr *result, const union nf_inet_addr *mask) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ua = (const unsigned long *)a1; unsigned long *ur = (unsigned long *)result; const unsigned long *um = (const unsigned long *)mask; ur[0] = ua[0] & um[0]; ur[1] = ua[1] & um[1]; #else result->all[0] = a1->all[0] & mask->all[0]; result->all[1] = a1->all[1] & mask->all[1]; result->all[2] = a1->all[2] & mask->all[2]; result->all[3] = a1->all[3] & mask->all[3]; #endif } int netfilter_init(void); struct sk_buff; struct nf_hook_ops; struct sock; struct nf_hook_state { u8 hook; u8 pf; struct net_device *in; struct net_device *out; struct sock *sk; struct net *net; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, NF_HOOK_OP_BPF, }; struct nf_hook_ops { /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; void *priv; u8 pf; enum nf_hook_ops_type hook_ops_type:8; unsigned int hooknum; /* Hooks are ordered in ascending priority. */ int priority; }; struct nf_hook_entry { nf_hookfn *hook; void *priv; }; struct nf_hook_entries_rcu_head { struct rcu_head head; void *allocation; }; struct nf_hook_entries { u16 num_hook_entries; /* padding */ struct nf_hook_entry hooks[]; /* trailer: pointers to original orig_ops of each hook, * followed by rcu_head and scratch space used for freeing * the structure via call_rcu. * * This is not part of struct nf_hook_entry since its only * needed in slow path (hook register/unregister): * const struct nf_hook_ops *orig_ops[] * * For the same reason, we store this at end -- its * only needed when a hook is deleted, not during * packet path processing: * struct nf_hook_entries_rcu_head head */ }; #ifdef CONFIG_NETFILTER static inline struct nf_hook_ops **nf_hook_entries_get_hook_ops(const struct nf_hook_entries *e) { unsigned int n = e->num_hook_entries; const void *hook_end; hook_end = &e->hooks[n]; /* this is *past* ->hooks[]! */ return (struct nf_hook_ops **)hook_end; } static inline int nf_hook_entry_hookfn(const struct nf_hook_entry *entry, struct sk_buff *skb, struct nf_hook_state *state) { return entry->hook(entry->priv, skb, state); } static inline void nf_hook_state_init(struct nf_hook_state *p, unsigned int hook, u_int8_t pf, struct net_device *indev, struct net_device *outdev, struct sock *sk, struct net *net, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { p->hook = hook; p->pf = pf; p->in = indev; p->out = outdev; p->sk = sk; p->net = net; p->okfn = okfn; } struct nf_sockopt_ops { struct list_head list; u_int8_t pf; /* Non-inclusive ranges: use 0/0/NULL to never get called. */ int set_optmin; int set_optmax; int (*set)(struct sock *sk, int optval, sockptr_t arg, unsigned int len); int get_optmin; int get_optmax; int (*get)(struct sock *sk, int optval, void __user *user, int *len); /* Use the module struct to lock set/get code in place */ struct module *owner; }; /* Function to register/unregister hook points. */ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef CONFIG_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #endif int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int i); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e); /** * nf_hook - call a netfilter hook * * Returns 1 if the hook has allowed the packet to pass. The function * okfn must be invoked by the caller in this case. Any other return * value indicates the packet has been consumed by the hook. */ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; int ret = 1; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return 1; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; case NFPROTO_ARP: #ifdef CONFIG_NETFILTER_FAMILY_ARP if (WARN_ON_ONCE(hook >= ARRAY_SIZE(net->nf.hooks_arp))) break; hook_head = rcu_dereference(net->nf.hooks_arp[hook]); #endif break; case NFPROTO_BRIDGE: #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head, 0); } rcu_read_unlock(); return ret; } /* Activate hook; either okfn or kfree_skb called, unless a hook returns NF_STOLEN (in which case, it's up to the hook to deal with the consequences). Returns -ERRNO if packet dropped. Zero means queued, stolen or accepted. */ /* RR: > I don't want nf_hook to return anything because people might forget > about async and trust the return value to mean "packet was ok". AK: Just document it clearly, then you can expect some sense from kernel coders :) */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { int ret; if (!cond || ((ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn)) == 1)) ret = okfn(net, sk, skb); return ret; } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn); if (ret == 1) ret = okfn(net, sk, skb); return ret; } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && __builtin_constant_p(hook) && !static_key_false(&nf_hooks_needed[pf][hook])) return; #endif rcu_read_lock(); switch (pf) { case NFPROTO_IPV4: hook_head = rcu_dereference(net->nf.hooks_ipv4[hook]); break; case NFPROTO_IPV6: hook_head = rcu_dereference(net->nf.hooks_ipv6[hook]); break; default: WARN_ON_ONCE(1); break; } if (hook_head) { struct nf_hook_state state; nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); nf_hook_slow_list(head, &state, hook_head); } rcu_read_unlock(); } /* Call setsockopt() */ int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, sockptr_t opt, unsigned int len); int nf_getsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt, int *len); struct flowi; struct nf_queue_entry; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol, unsigned short family); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u_int8_t protocol, unsigned short family); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family); int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry); #include <net/flow.h> struct nf_conn; enum nf_nat_manip_type; struct nlattr; enum ip_conntrack_dir; struct nf_nat_hook { int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl); unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { #if IS_ENABLED(CONFIG_NF_NAT) const struct nf_nat_hook *nat_hook; rcu_read_lock(); nat_hook = rcu_dereference(nf_nat_hook); if (nat_hook && nat_hook->decode_session) nat_hook->decode_session(skb, fl); rcu_read_unlock(); #endif } #else /* !CONFIG_NETFILTER */ static inline int NF_HOOK_COND(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *), bool cond) { return okfn(net, sk, skb); } static inline int NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return okfn(net, sk, skb); } static inline void NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct list_head *head, struct net_device *in, struct net_device *out, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { /* nothing to do */ } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, struct sock *sk, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { return 1; } struct flowi; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { } #endif /*CONFIG_NETFILTER*/ #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <linux/netfilter/nf_conntrack_zones_common.h> void nf_ct_attach(struct sk_buff *, const struct sk_buff *); void nf_ct_set_closing(struct nf_conntrack *nfct); struct nf_conntrack_tuple; bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb); #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {} struct nf_conntrack_tuple; static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { return false; } #endif struct nf_conn; enum ip_conntrack_info; struct nf_ct_hook { int (*update)(struct net *net, struct sk_buff *skb); void (*destroy)(struct nf_conntrack *); bool (*get_tuple_skb)(struct nf_conntrack_tuple *, const struct sk_buff *); void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb); void (*set_closing)(struct nf_conntrack *nfct); }; extern const struct nf_ct_hook __rcu *nf_ct_hook; struct nlattr; struct nfnl_ct_hook { size_t (*build_size)(const struct nf_conn *ct); int (*build)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u_int16_t ct_attr, u_int16_t ct_info_attr); int (*parse)(const struct nlattr *attr, struct nf_conn *ct); int (*attach_expect)(const struct nlattr *attr, struct nf_conn *ct, u32 portid, u32 report); void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, s32 off); }; extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook; struct nf_defrag_hook { struct module *owner; int (*enable)(struct net *net); void (*disable)(struct net *net); }; extern const struct nf_defrag_hook __rcu *nf_defrag_v4_hook; extern const struct nf_defrag_hook __rcu *nf_defrag_v6_hook; /* * nf_skb_duplicated - TEE target has sent a packet * * When a xtables target sends a packet, the OUTPUT and POSTROUTING * hooks are traversed again, i.e. nft and xtables are invoked recursively. * * This is used by xtables TEE target to prevent the duplicated skb from * being duplicated again. */ DECLARE_PER_CPU(bool, nf_skb_duplicated); /* * Contains bitmask of ctnetlink event subscribers, if any. * Can't be pernet due to NETLINK_LISTEN_ALL_NSID setsockopt flag. */ extern u8 nf_ctnetlink_has_listener; #endif /*__LINUX_NETFILTER_H*/
146 146 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor policy manipulation functions * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. * * AppArmor policy namespaces, allow for different sets of policies * to be loaded for tasks within the namespace. */ #include <linux/list.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string.h> #include "include/apparmor.h" #include "include/cred.h" #include "include/policy_ns.h" #include "include/label.h" #include "include/policy.h" /* kernel label */ struct aa_label *kernel_t; /* root profile namespace */ struct aa_ns *root_ns; const char *aa_hidden_ns_name = "---"; /** * aa_ns_visible - test if @view is visible from @curr * @curr: namespace to treat as the parent (NOT NULL) * @view: namespace to test if visible from @curr (NOT NULL) * @subns: whether view of a subns is allowed * * Returns: true if @view is visible from @curr else false */ bool aa_ns_visible(struct aa_ns *curr, struct aa_ns *view, bool subns) { if (curr == view) return true; if (!subns) return false; for ( ; view; view = view->parent) { if (view->parent == curr) return true; } return false; } /** * aa_ns_name - Find the ns name to display for @view from @curr * @curr: current namespace (NOT NULL) * @view: namespace attempting to view (NOT NULL) * @subns: are subns visible * * Returns: name of @view visible from @curr */ const char *aa_ns_name(struct aa_ns *curr, struct aa_ns *view, bool subns) { /* if view == curr then the namespace name isn't displayed */ if (curr == view) return ""; if (aa_ns_visible(curr, view, subns)) { /* at this point if a ns is visible it is in a view ns * thus the curr ns.hname is a prefix of its name. * Only output the virtualized portion of the name * Add + 2 to skip over // separating curr hname prefix * from the visible tail of the views hname */ return view->base.hname + strlen(curr->base.hname) + 2; } return aa_hidden_ns_name; } static struct aa_profile *alloc_unconfined(const char *name) { struct aa_profile *profile; profile = aa_alloc_null(NULL, name, GFP_KERNEL); if (!profile) return NULL; profile->label.flags |= FLAG_IX_ON_NAME_ERROR | FLAG_IMMUTIBLE | FLAG_NS_COUNT | FLAG_UNCONFINED; profile->mode = APPARMOR_UNCONFINED; return profile; } /** * alloc_ns - allocate, initialize and return a new namespace * @prefix: parent namespace name (MAYBE NULL) * @name: a preallocated name (NOT NULL) * * Returns: refcounted namespace or NULL on failure. */ static struct aa_ns *alloc_ns(const char *prefix, const char *name) { struct aa_ns *ns; ns = kzalloc(sizeof(*ns), GFP_KERNEL); AA_DEBUG("%s(%p)\n", __func__, ns); if (!ns) return NULL; if (!aa_policy_init(&ns->base, prefix, name, GFP_KERNEL)) goto fail_ns; INIT_LIST_HEAD(&ns->sub_ns); INIT_LIST_HEAD(&ns->rawdata_list); mutex_init(&ns->lock); init_waitqueue_head(&ns->wait); /* released by aa_free_ns() */ ns->unconfined = alloc_unconfined("unconfined"); if (!ns->unconfined) goto fail_unconfined; /* ns and ns->unconfined share ns->unconfined refcount */ ns->unconfined->ns = ns; atomic_set(&ns->uniq_null, 0); aa_labelset_init(&ns->labels); return ns; fail_unconfined: aa_policy_destroy(&ns->base); fail_ns: kfree_sensitive(ns); return NULL; } /** * aa_free_ns - free a profile namespace * @ns: the namespace to free (MAYBE NULL) * * Requires: All references to the namespace must have been put, if the * namespace was referenced by a profile confining a task, */ void aa_free_ns(struct aa_ns *ns) { if (!ns) return; aa_policy_destroy(&ns->base); aa_labelset_destroy(&ns->labels); aa_put_ns(ns->parent); ns->unconfined->ns = NULL; aa_free_profile(ns->unconfined); kfree_sensitive(ns); } /** * aa_findn_ns - look up a profile namespace on the namespace list * @root: namespace to search in (NOT NULL) * @name: name of namespace to find (NOT NULL) * @n: length of @name * * Returns: a refcounted namespace on the list, or NULL if no namespace * called @name exists. * * refcount released by caller */ struct aa_ns *aa_findn_ns(struct aa_ns *root, const char *name, size_t n) { struct aa_ns *ns = NULL; rcu_read_lock(); ns = aa_get_ns(__aa_findn_ns(&root->sub_ns, name, n)); rcu_read_unlock(); return ns; } /** * aa_find_ns - look up a profile namespace on the namespace list * @root: namespace to search in (NOT NULL) * @name: name of namespace to find (NOT NULL) * * Returns: a refcounted namespace on the list, or NULL if no namespace * called @name exists. * * refcount released by caller */ struct aa_ns *aa_find_ns(struct aa_ns *root, const char *name) { return aa_findn_ns(root, name, strlen(name)); } /** * __aa_lookupn_ns - lookup the namespace matching @hname * @view: namespace to search in (NOT NULL) * @hname: hierarchical ns name (NOT NULL) * @n: length of @hname * * Requires: rcu_read_lock be held * * Returns: unrefcounted ns pointer or NULL if not found * * Do a relative name lookup, recursing through profile tree. */ struct aa_ns *__aa_lookupn_ns(struct aa_ns *view, const char *hname, size_t n) { struct aa_ns *ns = view; const char *split; for (split = strnstr(hname, "//", n); split; split = strnstr(hname, "//", n)) { ns = __aa_findn_ns(&ns->sub_ns, hname, split - hname); if (!ns) return NULL; n -= split + 2 - hname; hname = split + 2; } if (n) return __aa_findn_ns(&ns->sub_ns, hname, n); return NULL; } /** * aa_lookupn_ns - look up a policy namespace relative to @view * @view: namespace to search in (NOT NULL) * @name: name of namespace to find (NOT NULL) * @n: length of @name * * Returns: a refcounted namespace on the list, or NULL if no namespace * called @name exists. * * refcount released by caller */ struct aa_ns *aa_lookupn_ns(struct aa_ns *view, const char *name, size_t n) { struct aa_ns *ns = NULL; rcu_read_lock(); ns = aa_get_ns(__aa_lookupn_ns(view, name, n)); rcu_read_unlock(); return ns; } static struct aa_ns *__aa_create_ns(struct aa_ns *parent, const char *name, struct dentry *dir) { struct aa_ns *ns; int error; AA_BUG(!parent); AA_BUG(!name); AA_BUG(!mutex_is_locked(&parent->lock)); ns = alloc_ns(parent->base.hname, name); if (!ns) return ERR_PTR(-ENOMEM); ns->level = parent->level + 1; mutex_lock_nested(&ns->lock, ns->level); error = __aafs_ns_mkdir(ns, ns_subns_dir(parent), name, dir); if (error) { AA_ERROR("Failed to create interface for ns %s\n", ns->base.name); mutex_unlock(&ns->lock); aa_free_ns(ns); return ERR_PTR(error); } ns->parent = aa_get_ns(parent); list_add_rcu(&ns->base.list, &parent->sub_ns); /* add list ref */ aa_get_ns(ns); mutex_unlock(&ns->lock); return ns; } /** * __aa_find_or_create_ns - create an ns, fail if it already exists * @parent: the parent of the namespace being created * @name: the name of the namespace * @dir: if not null the dir to put the ns entries in * * Returns: the a refcounted ns that has been add or an ERR_PTR */ struct aa_ns *__aa_find_or_create_ns(struct aa_ns *parent, const char *name, struct dentry *dir) { struct aa_ns *ns; AA_BUG(!mutex_is_locked(&parent->lock)); /* try and find the specified ns */ /* released by caller */ ns = aa_get_ns(__aa_find_ns(&parent->sub_ns, name)); if (!ns) ns = __aa_create_ns(parent, name, dir); else ns = ERR_PTR(-EEXIST); /* return ref */ return ns; } /** * aa_prepare_ns - find an existing or create a new namespace of @name * @parent: ns to treat as parent * @name: the namespace to find or add (NOT NULL) * * Returns: refcounted namespace or PTR_ERR if failed to create one */ struct aa_ns *aa_prepare_ns(struct aa_ns *parent, const char *name) { struct aa_ns *ns; mutex_lock_nested(&parent->lock, parent->level); /* try and find the specified ns and if it doesn't exist create it */ /* released by caller */ ns = aa_get_ns(__aa_find_ns(&parent->sub_ns, name)); if (!ns) ns = __aa_create_ns(parent, name, NULL); mutex_unlock(&parent->lock); /* return ref */ return ns; } static void __ns_list_release(struct list_head *head); /** * destroy_ns - remove everything contained by @ns * @ns: namespace to have it contents removed (NOT NULL) */ static void destroy_ns(struct aa_ns *ns) { if (!ns) return; mutex_lock_nested(&ns->lock, ns->level); /* release all profiles in this namespace */ __aa_profile_list_release(&ns->base.profiles); /* release all sub namespaces */ __ns_list_release(&ns->sub_ns); if (ns->parent) { unsigned long flags; write_lock_irqsave(&ns->labels.lock, flags); __aa_proxy_redirect(ns_unconfined(ns), ns_unconfined(ns->parent)); write_unlock_irqrestore(&ns->labels.lock, flags); } __aafs_ns_rmdir(ns); mutex_unlock(&ns->lock); } /** * __aa_remove_ns - remove a namespace and all its children * @ns: namespace to be removed (NOT NULL) * * Requires: ns->parent->lock be held and ns removed from parent. */ void __aa_remove_ns(struct aa_ns *ns) { /* remove ns from namespace list */ list_del_rcu(&ns->base.list); destroy_ns(ns); aa_put_ns(ns); } /** * __ns_list_release - remove all profile namespaces on the list put refs * @head: list of profile namespaces (NOT NULL) * * Requires: namespace lock be held */ static void __ns_list_release(struct list_head *head) { struct aa_ns *ns, *tmp; list_for_each_entry_safe(ns, tmp, head, base.list) __aa_remove_ns(ns); } /** * aa_alloc_root_ns - allocate the root profile namespace * * Returns: %0 on success else error * */ int __init aa_alloc_root_ns(void) { struct aa_profile *kernel_p; /* released by aa_free_root_ns - used as list ref*/ root_ns = alloc_ns(NULL, "root"); if (!root_ns) return -ENOMEM; kernel_p = alloc_unconfined("kernel_t"); if (!kernel_p) { destroy_ns(root_ns); aa_free_ns(root_ns); return -ENOMEM; } kernel_t = &kernel_p->label; root_ns->unconfined->ns = aa_get_ns(root_ns); return 0; } /** * aa_free_root_ns - free the root profile namespace */ void __init aa_free_root_ns(void) { struct aa_ns *ns = root_ns; root_ns = NULL; aa_label_free(kernel_t); destroy_ns(ns); aa_put_ns(ns); }
39 39 39 53 53 39 51 52 17 53 52 1061 53 53 52 53 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * No idle tick implementation for low and high resolution timers * * Started by: Thomas Gleixner and Ingo Molnar */ #include <linux/cpu.h> #include <linux/err.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/percpu.h> #include <linux/nmi.h> #include <linux/profile.h> #include <linux/sched/signal.h> #include <linux/sched/clock.h> #include <linux/sched/stat.h> #include <linux/sched/nohz.h> #include <linux/sched/loadavg.h> #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> #include <linux/context_tracking.h> #include <linux/mm.h> #include <asm/irq_regs.h> #include "tick-internal.h" #include <trace/events/timer.h> /* * Per-CPU nohz control structure */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* * The time, when the last jiffy update happened. Write access must hold * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; /* * Must be called with interrupts disabled ! */ static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; ktime_t delta, nextp; /* * 64bit can do a quick check without holding jiffies lock and * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * * 32bit cannot do that because the store of tick_next_period * consists of two 32bit stores and the first store could move it * to a random point in the future. */ if (IS_ENABLED(CONFIG_64BIT)) { if (ktime_before(now, smp_load_acquire(&tick_next_period))) return; } else { unsigned int seq; /* * Avoid contention on jiffies_lock and protect the quick * check with the sequence count. */ do { seq = read_seqcount_begin(&jiffies_seq); nextp = tick_next_period; } while (read_seqcount_retry(&jiffies_seq, seq)); if (ktime_before(now, nextp)) return; } /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); /* * Reevaluate with the lock held. Another CPU might have done the * update already. */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; } write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { last_jiffies_update = ktime_add_ns(last_jiffies_update, TICK_NSEC); } /* Advance jiffies to complete the jiffies_seq protected job */ jiffies_64 += ticks; /* * Keep the tick_next_period variable up to date. */ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); if (IS_ENABLED(CONFIG_64BIT)) { /* * Pairs with smp_load_acquire() in the lockless quick * check above and ensures that the update to jiffies_64 is * not reordered vs. the store to tick_next_period, neither * by the compiler nor by the CPU. */ smp_store_release(&tick_next_period, nextp); } else { /* * A plain store is good enough on 32bit as the quick check * above is protected by the sequence count. */ tick_next_period = nextp; } /* * Release the sequence count. calc_global_load() below is not * protected by it, but jiffies_lock needs to be held to prevent * concurrent invocations. */ write_seqcount_end(&jiffies_seq); calc_global_load(); raw_spin_unlock(&jiffies_lock); update_wall_time(); } /* * Initialize and return retrieve the jiffies update. */ static ktime_t tick_init_jiffy_update(void) { ktime_t period; raw_spin_lock(&jiffies_lock); write_seqcount_begin(&jiffies_seq); /* Did we start the jiffies update yet ? */ if (last_jiffies_update == 0) { u32 rem; /* * Ensure that the tick is aligned to a multiple of * TICK_NSEC. */ div_u64_rem(tick_next_period, TICK_NSEC, &rem); if (rem) tick_next_period += TICK_NSEC - rem; last_jiffies_update = tick_next_period; } period = last_jiffies_update; write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); return period; } #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) { int cpu = smp_processor_id(); #ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the CPU in charge went * into a long sleep. If two CPUs happen to assign themselves to * this duty, then the jiffies update is still serialized by * jiffies_lock. * * If nohz_full is enabled, this should not happen because the * tick_do_timer_cpu never relinquishes. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { #ifdef CONFIG_NO_HZ_FULL WARN_ON_ONCE(tick_nohz_full_running); #endif tick_do_timer_cpu = cpu; } #endif /* Check, if the jiffies need an update */ if (tick_do_timer_cpu == cpu) tick_do_update_jiffies64(now); /* * If jiffies update stalled for too long (timekeeper in stop_machine() * or VMEXIT'ed for several msecs), force an update. */ if (ts->last_tick_jiffies != jiffies) { ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { tick_do_update_jiffies64(now); ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } } if (ts->inidle) ts->got_idle_tick = 1; } static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { #ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long * time. This happens on complete idle SMP systems while * waiting on the login prompt. We also increment the "start of * idle" jiffy stamp so the idle accounting adjustment we do * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; /* * In case the current tick fired too early past its expected * expiration, make sure we don't bypass the next clock reprogramming * to the same deadline. */ ts->next_tick = 0; } #endif update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } #endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; } if (val & TICK_DEP_MASK_PERF_EVENTS) { trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); return true; } if (val & TICK_DEP_MASK_SCHED) { trace_tick_stop(0, TICK_DEP_MASK_SCHED); return true; } if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); return true; } if (val & TICK_DEP_MASK_RCU) { trace_tick_stop(0, TICK_DEP_MASK_RCU); return true; } if (val & TICK_DEP_MASK_RCU_EXP) { trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP); return true; } return false; } static bool can_stop_full_tick(int cpu, struct tick_sched *ts) { lockdep_assert_irqs_disabled(); if (unlikely(!cpu_online(cpu))) return false; if (check_tick_dependency(&tick_dep_mask)) return false; if (check_tick_dependency(&ts->tick_dep_mask)) return false; if (check_tick_dependency(&current->tick_dep_mask)) return false; if (check_tick_dependency(&current->signal->tick_dep_mask)) return false; return true; } static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; irq_work_queue(this_cpu_ptr(&nohz_full_kick_work)); } /* * Kick the CPU if it's full dynticks in order to force it to * re-evaluate its dependency on the tick and restart it if necessary. */ void tick_nohz_full_kick_cpu(int cpu) { if (!tick_nohz_full_cpu(cpu)) return; irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } static void tick_nohz_kick_task(struct task_struct *tsk) { int cpu; /* * If the task is not running, run_posix_cpu_timers() * has nothing to elapse, IPI can then be spared. * * activate_task() STORE p->tick_dep_mask * STORE p->on_rq * __schedule() (switch to task 'p') smp_mb() (atomic_fetch_or()) * LOCK rq->lock LOAD p->on_rq * smp_mb__after_spin_lock() * tick_nohz_task_switch() * LOAD p->tick_dep_mask */ if (!sched_task_on_rq(tsk)) return; /* * If the task concurrently migrates to another CPU, * we guarantee it sees the new tick dependency upon * schedule. * * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * LOCK rq->lock * smp_mb__after_spin_lock() STORE p->tick_dep_mask * tick_nohz_task_switch() smp_mb() (atomic_fetch_or()) * LOAD p->tick_dep_mask LOAD p->cpu */ cpu = task_cpu(tsk); preempt_disable(); if (cpu_online(cpu)) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ static void tick_nohz_full_kick_all(void) { int cpu; if (!tick_nohz_full_running) return; preempt_disable(); for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) tick_nohz_full_kick_cpu(cpu); preempt_enable(); } static void tick_nohz_dep_set_all(atomic_t *dep, enum tick_dep_bits bit) { int prev; prev = atomic_fetch_or(BIT(bit), dep); if (!prev) tick_nohz_full_kick_all(); } /* * Set a global tick dependency. Used by perf events that rely on freq and * by unstable clock. */ void tick_nohz_dep_set(enum tick_dep_bits bit) { tick_nohz_dep_set_all(&tick_dep_mask, bit); } void tick_nohz_dep_clear(enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tick_dep_mask); } /* * Set per-CPU tick dependency. Used by scheduler and perf events in order to * manage events throttling. */ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) { int prev; struct tick_sched *ts; ts = per_cpu_ptr(&tick_cpu_sched, cpu); prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask); if (!prev) { preempt_disable(); /* Perf needs local kick that is NMI safe */ if (cpu == smp_processor_id()) { tick_nohz_full_kick(); } else { /* Remote irq work not NMI-safe */ if (!WARN_ON_ONCE(in_nmi())) tick_nohz_full_kick_cpu(cpu); } preempt_enable(); } } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu); void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); atomic_andnot(BIT(bit), &ts->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu); /* * Set a per-task tick dependency. RCU need this. Also posix CPU timers * in order to elapse per task timers. */ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) { if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) tick_nohz_kick_task(tsk); } EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task); void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &tsk->tick_dep_mask); } EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task); /* * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse * per process timers. */ void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit) { int prev; struct signal_struct *sig = tsk->signal; prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask); if (!prev) { struct task_struct *t; lockdep_assert_held(&tsk->sighand->siglock); __for_each_thread(sig, t) tick_nohz_kick_task(t); } } void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) { atomic_andnot(BIT(bit), &sig->tick_dep_mask); } /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: * perf events, posix CPU timers, ... */ void __tick_nohz_task_switch(void) { struct tick_sched *ts; if (!tick_nohz_full_cpu(smp_processor_id())) return; ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { if (atomic_read(&current->tick_dep_mask) || atomic_read(&current->signal->tick_dep_mask)) tick_nohz_full_kick(); } } /* Get the boot-time nohz CPU list from the kernel parameters. */ void __init tick_nohz_full_setup(cpumask_var_t cpumask) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); cpumask_copy(tick_nohz_full_mask, cpumask); tick_nohz_full_running = true; } bool tick_nohz_cpu_hotpluggable(unsigned int cpu) { /* * The tick_do_timer_cpu CPU handles housekeeping duty (unbound * timers, workqueues, timekeeping, ...) on behalf of full dynticks * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return false; return true; } static int tick_nohz_cpu_down(unsigned int cpu) { return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY; } void __init tick_nohz_init(void) { int cpu, ret; if (!tick_nohz_full_running) return; /* * Full dynticks uses irq work to drive the tick rescheduling on safe * locking contexts. But then we need irq work to raise its own * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); tick_nohz_full_running = false; return; } if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warn("NO_HZ: Clearing %d from nohz_full range " "for timekeeping\n", cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } } for_each_cpu(cpu, tick_nohz_full_mask) ct_cpu_track_user(cpu); ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "kernel/nohz:predown", NULL, tick_nohz_cpu_down); WARN_ON(ret < 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); } #endif /* * NOHZ - aka dynamic tick functionality */ #ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->tick_stopped; } bool tick_nohz_tick_stopped_cpu(int cpu) { struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); return ts->tick_stopped; } /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * * Called from interrupt entry when the CPU was idle * * In case the sched_tick was stopped on this CPU, we have to check if jiffies * must be updated. Otherwise an interrupt handler could use a stale jiffy * value. We do this unconditionally on any CPU, as we don't know whether the * CPU, which has the update task assigned is in a long sleep. */ static void tick_nohz_update_jiffies(ktime_t now) { unsigned long flags; __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); local_irq_restore(flags); touch_softlockup_watchdog_sched(); } static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { ktime_t delta; if (WARN_ON_ONCE(!ts->idle_active)) return; delta = ktime_sub(now, ts->idle_entrytime); write_seqcount_begin(&ts->idle_sleeptime_seq); if (nr_iowait_cpu(smp_processor_id()) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); else ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); ts->idle_entrytime = now; ts->idle_active = 0; write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_wakeup_event(); } static void tick_nohz_start_idle(struct tick_sched *ts) { write_seqcount_begin(&ts->idle_sleeptime_seq); ts->idle_entrytime = ktime_get(); ts->idle_active = 1; write_seqcount_end(&ts->idle_sleeptime_seq); sched_clock_idle_sleep_event(); } static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime, bool compute_delta, u64 *last_update_time) { ktime_t now, idle; unsigned int seq; if (!tick_nohz_active) return -1; now = ktime_get(); if (last_update_time) *last_update_time = ktime_to_us(now); do { seq = read_seqcount_begin(&ts->idle_sleeptime_seq); if (ts->idle_active && compute_delta) { ktime_t delta = ktime_sub(now, ts->idle_entrytime); idle = ktime_add(*sleeptime, delta); } else { idle = *sleeptime; } } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq)); return ktime_to_us(idle); } /** * get_cpu_idle_time_us - get the total idle time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative idle time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime, !nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * get_cpu_iowait_time_us - get the total iowait time of a CPU * @cpu: CPU number to query * @last_update_time: variable to store update time in. Do not update * counters if NULL. * * Return the cumulative iowait time (since boot) for a given * CPU, in microseconds. Note this is partially broken due to * the counter of iowait tasks that can be remotely updated without * any synchronization. Therefore it is possible to observe backward * values within two consecutive reads. * * This time is measured via accounting rather than sampling, * and is as accurate as ktime_get() is. * * This function returns -1 if NOHZ is not enabled. */ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime, nr_iowait_cpu(cpu), last_update_time); } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { hrtimer_cancel(&ts->sched_timer); hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); } else { tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } /* * Reset to make sure next tick stop doesn't get fooled by past * cached clock deadline. */ ts->next_tick = 0; } static inline bool local_timer_softirq_pending(void) { return local_softirq_pending() & BIT(TIMER_SOFTIRQ); } static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu) { u64 basemono, next_tick, delta, expires; unsigned long basejiff; unsigned int seq; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqcount_begin(&jiffies_seq); basemono = last_jiffies_update; basejiff = jiffies; } while (read_seqcount_retry(&jiffies_seq, seq)); ts->last_jiffies = basejiff; ts->timer_expires_base = basemono; /* * Keep the periodic tick, when RCU, architecture or irq_work * requests it. * Aside of that check whether the local timer softirq is * pending. If so its a bad idea to call get_next_timer_interrupt() * because there is an already expired timer, so it will request * immediate expiry, which rearms the hardware timer with a * minimal delta which brings us back to this place * immediately. Lather, rinse and repeat... */ if (rcu_needs_cpu() || arch_needs_cpu() || irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* * Get the next pending timer. If high resolution * timers are enabled this only takes the timer wheel * timers into account. If high resolution timers are * disabled this also looks at the next expiring * hrtimer. */ next_tick = get_next_timer_interrupt(basejiff, basemono); ts->next_timer = next_tick; } /* * If the tick is due in the next period, keep it ticking or * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { /* * Tell the timer code that the base is not idle, i.e. undo * the effect of get_next_timer_interrupt(): */ timer_clear_idle(); /* * We've not stopped the tick yet, and there's a timer in the * next period, so no point in stopping it either, bail. */ if (!ts->tick_stopped) { ts->timer_expires = 0; goto out; } } /* * If this CPU is the one which had the do_timer() duty last, we limit * the sleep time to the timekeeping max_deferment value. * Otherwise we can sleep as long as we want. */ delta = timekeeping_max_deferment(); if (cpu != tick_do_timer_cpu && (tick_do_timer_cpu != TICK_DO_TIMER_NONE || !ts->do_timer_last)) delta = KTIME_MAX; /* Calculate the next expiry time */ if (delta < (KTIME_MAX - basemono)) expires = basemono + delta; else expires = KTIME_MAX; ts->timer_expires = min_t(u64, expires, next_tick); out: return ts->timer_expires; } static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); u64 basemono = ts->timer_expires_base; u64 expires = ts->timer_expires; ktime_t tick = expires; /* Make sure we won't be trying to stop it twice in a row. */ ts->timer_expires_base = 0; /* * If this CPU is the one which updates jiffies, then give up * the assignment and let it be taken by the CPU which runs * the tick timer next, which might be this CPU as well. If we * don't drop this here the jiffies might be stale and * do_timer() never invoked. Keep track of the fact that it * was the one which had the do_timer() duty last. */ if (cpu == tick_do_timer_cpu) { tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->do_timer_last = 1; } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { ts->do_timer_last = 0; } /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && (expires == ts->next_tick)) { /* Sanity check: make sure clockevent is actually programmed */ if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer)) return; WARN_ON_ONCE(1); printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick, dev->next_event, hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer)); } /* * nohz_stop_sched_tick can be called several times before * the nohz_restart_sched_tick is called. This happens when * interrupts arrive which do not cause a reschedule. In the * first call we save the current tick time, so we can restart * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { calc_load_nohz_start(); quiet_vmstat(); ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; trace_tick_stop(1, TICK_DEP_MASK_NONE); } ts->next_tick = tick; /* * If the expiration time == KTIME_MAX, then we simply stop * the tick timer. */ if (unlikely(expires == KTIME_MAX)) { if (ts->nohz_mode == NOHZ_MODE_HIGHRES) hrtimer_cancel(&ts->sched_timer); else tick_program_event(KTIME_MAX, 1); return; } if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED_HARD); } else { hrtimer_set_expires(&ts->sched_timer, tick); tick_program_event(tick, 1); } } static void tick_nohz_retain_tick(struct tick_sched *ts) { ts->timer_expires_base = 0; } #ifdef CONFIG_NO_HZ_FULL static void tick_nohz_stop_sched_tick(struct tick_sched *ts, int cpu) { if (tick_nohz_next_event(ts, cpu)) tick_nohz_stop_tick(ts, cpu); else tick_nohz_retain_tick(ts); } #endif /* CONFIG_NO_HZ_FULL */ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ tick_do_update_jiffies64(now); /* * Clear the timer idle flag, so we avoid IPIs on remote queueing and * the clock forward checks in the enqueue path: */ timer_clear_idle(); calc_load_nohz_stop(); touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ ts->tick_stopped = 0; tick_nohz_restart(ts, now); } static void __tick_nohz_full_update_tick(struct tick_sched *ts, ktime_t now) { #ifdef CONFIG_NO_HZ_FULL int cpu = smp_processor_id(); if (can_stop_full_tick(cpu, ts)) tick_nohz_stop_sched_tick(ts, cpu); else if (ts->tick_stopped) tick_nohz_restart_sched_tick(ts, now); #endif } static void tick_nohz_full_update_tick(struct tick_sched *ts) { if (!tick_nohz_full_cpu(smp_processor_id())) return; if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; __tick_nohz_full_update_tick(ts, ktime_get()); } /* * A pending softirq outside an IRQ (or softirq disabled section) context * should be waiting for ksoftirqd to handle it. Therefore we shouldn't * reach here due to the need_resched() early check in can_stop_idle_tick(). * * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the * cpu_down() process, softirqs can still be raised while ksoftirqd is parked, * triggering the below since wakep_softirqd() is ignored. * */ static bool report_idle_softirq(void) { static int ratelimit; unsigned int pending = local_softirq_pending(); if (likely(!pending)) return false; /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */ if (!cpu_active(smp_processor_id())) { pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK; if (!pending) return false; } if (ratelimit >= 10) return false; /* On RT, softirqs handling may be waiting on some lock */ if (local_bh_blocked()) return false; pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", pending); ratelimit++; return true; } static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) { /* * If this CPU is offline and it is the one which updates * jiffies, then give up the assignment and let it be taken by * the CPU which runs the tick timer next. If we don't drop * this here the jiffies might be stale and do_timer() never * invoked. */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) tick_do_timer_cpu = TICK_DO_TIMER_NONE; /* * Make sure the CPU doesn't get fooled by obsolete tick * deadline if it comes back online later. */ ts->next_tick = 0; return false; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) return false; if (need_resched()) return false; if (unlikely(report_idle_softirq())) return false; if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around */ if (tick_do_timer_cpu == cpu) return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) return false; } return true; } /** * tick_nohz_idle_stop_tick - stop the idle tick from the idle task * * When the next event is more than a tick into the future, stop the idle tick */ void tick_nohz_idle_stop_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); ktime_t expires; /* * If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the * tick timer expiration time is known already. */ if (ts->timer_expires_base) expires = ts->timer_expires; else if (can_stop_idle_tick(cpu, ts)) expires = tick_nohz_next_event(ts, cpu); else return; ts->idle_calls++; if (expires > 0LL) { int was_stopped = ts->tick_stopped; tick_nohz_stop_tick(ts, cpu); ts->idle_sleeps++; ts->idle_expires = expires; if (!was_stopped && ts->tick_stopped) { ts->idle_jiffies = ts->last_jiffies; nohz_balance_enter_idle(cpu); } } else { tick_nohz_retain_tick(ts); } } void tick_nohz_idle_retain_tick(void) { tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched)); /* * Undo the effect of get_next_timer_interrupt() called from * tick_nohz_next_event(). */ timer_clear_idle(); } /** * tick_nohz_idle_enter - prepare for entering idle on the current CPU * * Called when we start the idle loop. */ void tick_nohz_idle_enter(void) { struct tick_sched *ts; lockdep_assert_irqs_enabled(); local_irq_disable(); ts = this_cpu_ptr(&tick_cpu_sched); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 1; tick_nohz_start_idle(ts); local_irq_enable(); } /** * tick_nohz_irq_exit - update next tick event from interrupt exit * * When an interrupt fires while we are idle and it doesn't cause * a reschedule, it may still add, modify or delete a timer, enqueue * an RCU callback, etc... * So we need to re-calculate and reprogram the next tick event. */ void tick_nohz_irq_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->inidle) tick_nohz_start_idle(ts); else tick_nohz_full_update_tick(ts); } /** * tick_nohz_idle_got_tick - Check whether or not the tick handler has run */ bool tick_nohz_idle_got_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->got_idle_tick) { ts->got_idle_tick = 0; return true; } return false; } /** * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer * or the tick, whatever that expires first. Note that, if the tick has been * stopped, it returns the next hrtimer. * * Called from power state control code with interrupts disabled */ ktime_t tick_nohz_get_next_hrtimer(void) { return __this_cpu_read(tick_cpu_device.evtdev)->next_event; } /** * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * * Called from power state control code with interrupts disabled. * * The return value of this function and/or the value returned by it through the * @delta_next pointer can be negative which must be taken into account by its * callers. */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); int cpu = smp_processor_id(); /* * The idle entry time is expected to be a sufficient approximation of * the current time at this point. */ ktime_t now = ts->idle_entrytime; ktime_t next_event; WARN_ON_ONCE(!ts->inidle); *delta_next = ktime_sub(dev->next_event, now); if (!can_stop_idle_tick(cpu, ts)) return *delta_next; next_event = tick_nohz_next_event(ts, cpu); if (!next_event) return *delta_next; /* * If the next highres timer to expire is earlier than next_event, the * idle governor needs to know that. */ next_event = min_t(u64, next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } /** * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value * for a particular CPU. * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls_cpu(int cpu) { struct tick_sched *ts = tick_get_tick_sched(cpu); return ts->idle_calls; } /** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. */ unsigned long tick_nohz_get_idle_calls(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); return ts->idle_calls; } static void tick_nohz_account_idle_time(struct tick_sched *ts, ktime_t now) { unsigned long ticks; ts->idle_exittime = now; if (vtime_accounting_enabled_this_cpu()) return; /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick * accounting. Enforce that this is accounted to idle ! */ ticks = jiffies - ts->idle_jiffies; /* * We might be one off. Do not randomly account a huge number of ticks! */ if (ticks && ticks < LONG_MAX) account_idle_ticks(ticks); } void tick_nohz_idle_restart_tick(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (ts->tick_stopped) { ktime_t now = ktime_get(); tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } } static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now) { if (tick_nohz_full_cpu(smp_processor_id())) __tick_nohz_full_update_tick(ts, now); else tick_nohz_restart_sched_tick(ts, now); tick_nohz_account_idle_time(ts, now); } /** * tick_nohz_idle_exit - restart the idle tick from the idle task * * Restart the idle tick when the CPU is woken up from idle * This also exit the RCU extended quiescent state. The CPU * can use RCU again after this function is called. */ void tick_nohz_idle_exit(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); bool idle_active, tick_stopped; ktime_t now; local_irq_disable(); WARN_ON_ONCE(!ts->inidle); WARN_ON_ONCE(ts->timer_expires_base); ts->inidle = 0; idle_active = ts->idle_active; tick_stopped = ts->tick_stopped; if (idle_active || tick_stopped) now = ktime_get(); if (idle_active) tick_nohz_stop_idle(ts, now); if (tick_stopped) tick_nohz_idle_update_tick(ts, now); local_irq_enable(); } /* * The nohz low res interrupt handler */ static void tick_nohz_handler(struct clock_event_device *dev) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); dev->next_event = KTIME_MAX; tick_sched_do_timer(ts, now); tick_sched_handle(ts, regs); if (unlikely(ts->tick_stopped)) { /* * The clockevent device is not reprogrammed, so change the * clock event device to ONESHOT_STOPPED to avoid spurious * interrupts on devices which might not be truly one shot. */ tick_program_event(KTIME_MAX, 1); return; } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { if (!tick_nohz_enabled) return; ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) timers_update_nohz(); } /** * tick_nohz_switch_to_nohz - switch to nohz mode */ static void tick_nohz_switch_to_nohz(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t next; if (!tick_nohz_enabled) return; if (tick_switch_to_oneshot(tick_nohz_handler)) return; /* * Recycle the hrtimer in ts, so we can share the * hrtimer_forward with the highres code. */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); /* Get the next period */ next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } static inline void tick_nohz_irq_enter(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); /* * If all CPUs are idle. We may need to update a stale jiffies value. * Note nohz_full is a special case: a timekeeper is guaranteed to stay * alive but it might be busy looping with interrupts disabled in some * rare case (typically stop machine). So we must make sure we have a * last resort. */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } #else static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ void tick_irq_enter(void) { tick_check_oneshot_broadcast_this_cpu(); tick_nohz_irq_enter(); } /* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS /* * We rearm the timer until we get disabled by the idle code. * Called with interrupts disabled. */ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); tick_sched_do_timer(ts, now); /* * Do not call, when we are not in irq context and have * no valid regs pointer */ if (regs) tick_sched_handle(ts, regs); else ts->next_tick = 0; /* No need to reprogram if we are in idle or full dynticks mode */ if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } static int sched_skew_tick; static int __init skew_tick(char *str) { get_option(&str, &sched_skew_tick); return 0; } early_param("skew_tick", skew_tick); /** * tick_setup_sched_timer - setup the tick emulation timer */ void tick_setup_sched_timer(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); ktime_t now = ktime_get(); /* * Emulate tick processing via per-CPU hrtimers: */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ts->sched_timer.function = tick_sched_timer; /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); # ifdef CONFIG_HIGH_RES_TIMERS if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); # endif memset(ts, 0, sizeof(*ts)); } #endif /* * Async notification about clocksource changes */ void tick_clock_notify(void) { int cpu; for_each_possible_cpu(cpu) set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); } /* * Async notification about clock event changes */ void tick_oneshot_notify(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); set_bit(0, &ts->check_clocks); } /* * Check, if a change happened, which makes oneshot possible. * * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); if (!test_and_clear_bit(0, &ts->check_clocks)) return 0; if (ts->nohz_mode != NOHZ_MODE_INACTIVE) return 0; if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) return 0; if (!allow_nohz) return 1; tick_nohz_switch_to_nohz(); return 0; }
1 1 11 1 1 1 1 1 1 1 1 1 5 5 5 1 11 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 6 6 11 11 5 11 11 10 5 5 5 5 5 17 17 20 20 20 17 6 6 6 5 19 20 140 140 140 140 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 24 24 6 6 162 6 162 18 15 15 15 15 4 15 4 15 15 15 4 12 12 12 6 6 12 1 12 4 3 1 4 1 3 3 3 3 3 3 3 3 3 3 8 5 6 6 1 5 5 4 5 5 8 4 2 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_tunnel.h" static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid); static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct net_bridge_vlan *vle = ptr; u16 vid = *(u16 *)arg->key; return vle->vid != vid; } static const struct rhashtable_params br_vlan_rht_params = { .head_offset = offsetof(struct net_bridge_vlan, vnode), .key_offset = offsetof(struct net_bridge_vlan, vid), .key_len = sizeof(u16), .nelem_hint = 3, .max_size = VLAN_N_VID, .obj_cmpfn = br_vlan_cmp, .automatic_shrinking = true, }; static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid) { return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params); } static void __vlan_add_pvid(struct net_bridge_vlan_group *vg, const struct net_bridge_vlan *v) { if (vg->pvid == v->vid) return; smp_wmb(); br_vlan_set_pvid_state(vg, v->state); vg->pvid = v->vid; } static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid) { if (vg->pvid != vid) return; smp_wmb(); vg->pvid = 0; } /* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v. * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v. */ static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags, bool commit) { struct net_bridge_vlan_group *vg; bool change; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); /* check if anything would be changed on commit */ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) || ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED); if (!commit) goto out; if (flags & BRIDGE_VLAN_INFO_PVID) __vlan_add_pvid(vg, v); else __vlan_delete_pvid(vg, v->vid); if (flags & BRIDGE_VLAN_INFO_UNTAGGED) v->flags |= BRIDGE_VLAN_INFO_UNTAGGED; else v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED; out: return change; } static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags) { return __vlan_flags_update(v, flags, false); } static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags) { __vlan_flags_update(v, flags, true); } static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br, struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q add. */ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err == -EOPNOTSUPP) return vlan_vid_add(dev, br->vlan_proto, v->vid); v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV; return err; } static void __vlan_add_list(struct net_bridge_vlan *v) { struct net_bridge_vlan_group *vg; struct list_head *headp, *hpos; struct net_bridge_vlan *vent; if (br_vlan_is_master(v)) vg = br_vlan_group(v->br); else vg = nbp_vlan_group(v->port); headp = &vg->vlan_list; list_for_each_prev(hpos, headp) { vent = list_entry(hpos, struct net_bridge_vlan, vlist); if (v->vid >= vent->vid) break; } list_add_rcu(&v->vlist, hpos); } static void __vlan_del_list(struct net_bridge_vlan *v) { list_del_rcu(&v->vlist); } static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br, const struct net_bridge_vlan *v) { int err; /* Try switchdev op first. In case it is not supported, fallback to * 8021q del. */ err = br_switchdev_port_vlan_del(dev, v->vid); if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)) vlan_vid_del(dev, br->vlan_proto, v->vid); return err == -EOPNOTSUPP ? 0 : err; } /* Returns a master vlan, if it didn't exist it gets created. In all cases * a reference is taken to the master vlan before returning. */ static struct net_bridge_vlan * br_vlan_get_master(struct net_bridge *br, u16 vid, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *masterv; vg = br_vlan_group(br); masterv = br_vlan_find(vg, vid); if (!masterv) { bool changed; /* missing global ctx, create it now */ if (br_vlan_add(br, vid, 0, &changed, extack)) return NULL; masterv = br_vlan_find(vg, vid); if (WARN_ON(!masterv)) return NULL; refcount_set(&masterv->refcnt, 1); return masterv; } refcount_inc(&masterv->refcnt); return masterv; } static void br_master_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(!br_vlan_is_master(v)); free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_put_master(struct net_bridge_vlan *masterv) { struct net_bridge_vlan_group *vg; if (!br_vlan_is_master(masterv)) return; vg = br_vlan_group(masterv->br); if (refcount_dec_and_test(&masterv->refcnt)) { rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); br_multicast_toggle_one_vlan(masterv, false); br_multicast_ctx_deinit(&masterv->br_mcast_ctx); call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } static void nbp_vlan_rcu_free(struct rcu_head *rcu) { struct net_bridge_vlan *v; v = container_of(rcu, struct net_bridge_vlan, rcu); WARN_ON(br_vlan_is_master(v)); /* if we had per-port stats configured then free them here */ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS) free_percpu(v->stats); v->stats = NULL; kfree(v); } static void br_vlan_init_state(struct net_bridge_vlan *v) { struct net_bridge *br; if (br_vlan_is_master(v)) br = v->br; else br = v->port->br; if (br_opt_get(br, BROPT_MST_ENABLED)) { br_mst_vlan_init_state(v); return; } v->state = BR_STATE_FORWARDING; v->msti = 0; } /* This is the shared VLAN add function which works for both ports and bridge * devices. There are four possible calls to this function in terms of the * vlan entry type: * 1. vlan is being added on a port (no master flags, global entry exists) * 2. vlan is being added on a bridge (both master and brentry flags) * 3. vlan is being added on a port, but a global entry didn't exist which * is being created right now (master flag set, brentry flag unset), the * global entry is used for global per-vlan features, but not for filtering * 4. same as 3 but with both master and brentry flags set so the entry * will be used for filtering in both the port and the bridge */ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, struct netlink_ext_ack *extack) { struct net_bridge_vlan *masterv = NULL; struct net_bridge_port *p = NULL; struct net_bridge_vlan_group *vg; struct net_device *dev; struct net_bridge *br; int err; if (br_vlan_is_master(v)) { br = v->br; dev = br->dev; vg = br_vlan_group(br); } else { p = v->port; br = p->br; dev = p->dev; vg = nbp_vlan_group(p); } if (p) { /* Add VLAN to the device filter if it is supported. * This ensures tagged traffic enters the bridge when * promiscuous mode is disabled by br_manage_promisc(). */ err = __vlan_vid_add(dev, br, v, flags, extack); if (err) goto out; /* need to work on the master vlan too */ if (flags & BRIDGE_VLAN_INFO_MASTER) { bool changed; err = br_vlan_add(br, v->vid, flags | BRIDGE_VLAN_INFO_BRENTRY, &changed, extack); if (err) goto out_filt; if (changed) br_vlan_notify(br, NULL, v->vid, 0, RTM_NEWVLAN); } masterv = br_vlan_get_master(br, v->vid, extack); if (!masterv) { err = -ENOMEM; goto out_filt; } v->brvlan = masterv; if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { v->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!v->stats) { err = -ENOMEM; goto out_filt; } v->priv_flags |= BR_VLFLAG_PER_PORT_STATS; } else { v->stats = masterv->stats; } br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { if (br_vlan_should_use(v)) { err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack); if (err && err != -EOPNOTSUPP) goto out; } br_multicast_ctx_init(br, v, &v->br_mcast_ctx); v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); if (err) { br_err(br, "failed insert local address into bridge forwarding table\n"); goto out_filt; } vg->num_vlans++; } /* set the state before publishing */ br_vlan_init_state(v); err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); if (err) goto out_fdb_insert; __vlan_add_list(v); __vlan_flags_commit(v, flags); br_multicast_toggle_one_vlan(v, true); if (p) nbp_vlan_set_vlan_dev_state(p, v->vid); out: return err; out_fdb_insert: if (br_vlan_should_use(v)) { br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); vg->num_vlans--; } out_filt: if (p) { __vlan_vid_del(dev, br, v); if (masterv) { if (v->stats && masterv->stats != v->stats) free_percpu(v->stats); v->stats = NULL; br_vlan_put_master(masterv); v->brvlan = NULL; } } else { br_switchdev_port_vlan_del(dev, v->vid); } goto out; } static int __vlan_del(struct net_bridge_vlan *v) { struct net_bridge_vlan *masterv = v; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0; if (br_vlan_is_master(v)) { vg = br_vlan_group(v->br); } else { p = v->port; vg = nbp_vlan_group(v->port); masterv = v->brvlan; } __vlan_delete_pvid(vg, v->vid); if (p) { err = __vlan_vid_del(p->dev, p->br, v); if (err) goto out; } else { err = br_switchdev_port_vlan_del(v->br->dev, v->vid); if (err && err != -EOPNOTSUPP) goto out; err = 0; } if (br_vlan_should_use(v)) { v->flags &= ~BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans--; } if (masterv != v) { vlan_tunnel_info_del(vg, v); rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); nbp_vlan_set_vlan_dev_state(p, v->vid); br_multicast_toggle_one_vlan(v, false); br_multicast_port_ctx_deinit(&v->port_mcast_ctx); call_rcu(&v->rcu, nbp_vlan_rcu_free); } br_vlan_put_master(masterv); out: return err; } static void __vlan_group_free(struct net_bridge_vlan_group *vg) { WARN_ON(!list_empty(&vg->vlan_list)); rhashtable_destroy(&vg->vlan_hash); vlan_tunnel_deinit(vg); kfree(vg); } static void __vlan_flush(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; u16 v_start = 0, v_end = 0; int err; __vlan_delete_pvid(vg, vg->pvid); list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) { /* take care of disjoint ranges */ if (!v_start) { v_start = vlan->vid; } else if (vlan->vid - v_end != 1) { /* found range end, notify and start next one */ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); v_start = vlan->vid; } v_end = vlan->vid; err = __vlan_del(vlan); if (err) { br_err(br, "port %u(%s) failed to delete vlan %d: %pe\n", (unsigned int) p->port_no, p->dev->name, vlan->vid, ERR_PTR(err)); } } /* notify about the last/whole vlan range */ if (v_start) br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN); } struct sk_buff *br_handle_vlan(struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) goto out; /* At this point, we know that the frame was filtered and contains * a valid vlan id. If the vlan id has untagged flag set, * send untagged; otherwise, send tagged. */ br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); /* Vlan entry must be configured at this point. The * only exception is the bridge is set in promisc mode and the * packet is destined for the bridge device. In this case * pass the packet as is. */ if (!v || !br_vlan_should_use(v)) { if ((br->dev->flags & IFF_PROMISC) && skb->dev == br->dev) { goto out; } else { kfree_skb(skb); return NULL; } } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->tx_bytes, skb->len); u64_stats_inc(&stats->tx_packets); u64_stats_update_end(&stats->syncp); } /* If the skb will be sent using forwarding offload, the assumption is * that the switchdev will inject the packet into hardware together * with the bridge VLAN, so that it can be forwarded according to that * VLAN. The switchdev should deal with popping the VLAN header in * hardware on each egress port as appropriate. So only strip the VLAN * header if forwarding offload is not being used. */ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && br_handle_egress_vlan_tunnel(skb, v)) { kfree_skb(skb); return NULL; } out: return skb; } /* Called under RCU */ static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; bool tagged; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; /* If vlan tx offload is disabled on bridge device and frame was * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. */ if (unlikely(!skb_vlan_tag_present(skb) && skb->protocol == br->vlan_proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; } if (!br_vlan_get_tag(skb, vid)) { /* Tagged frame */ if (skb->vlan_proto != br->vlan_proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (unlikely(!skb)) return false; skb_pull(skb, ETH_HLEN); skb_reset_mac_len(skb); *vid = 0; tagged = false; } else { tagged = true; } } else { /* Untagged frame */ tagged = false; } if (!*vid) { u16 pvid = br_get_pvid(vg); /* Frame had a tag with VID 0 or did not have a tag. * See if pvid is set on this port. That tells us which * vlan untagged or priority-tagged traffic belongs to. */ if (!pvid) goto drop; /* PVID is set on this port. Any untagged or priority-tagged * ingress frame is considered to belong to this vlan. */ *vid = pvid; if (likely(!tagged)) /* Untagged Frame. */ __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. * At this point, we know that skb->vlan_tci VID * field was 0. * We update only VID field and preserve PCP field. */ skb->vlan_tci |= pvid; /* if snooping and stats are disabled we can avoid the lookup */ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); if (!br_vlan_state_allowed(*state, true)) goto drop; } return true; } } v = br_vlan_find(vg, *vid); if (!v || !br_vlan_should_use(v)) goto drop; if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_state(v); if (!br_vlan_state_allowed(*state, true)) goto drop; } if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { stats = this_cpu_ptr(v->stats); u64_stats_update_begin(&stats->syncp); u64_stats_add(&stats->rx_bytes, skb->len); u64_stats_inc(&stats->rx_packets); u64_stats_update_end(&stats->syncp); } *vlan = v; return true; drop: kfree_skb(skb); return false; } bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, u8 *state, struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. */ bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb) { const struct net_bridge_vlan *v; u16 vid; /* If this packet was not filtered at input, let it pass */ if (!BR_INPUT_SKB_CB(skb)->vlan_filtered) return true; br_vlan_get_tag(skb, &vid); v = br_vlan_find(vg, vid); if (v && br_vlan_should_use(v) && br_vlan_state_allowed(br_vlan_get_state(v), false)) return true; return false; } /* Called under RCU */ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid) { struct net_bridge_vlan_group *vg; struct net_bridge *br = p->br; struct net_bridge_vlan *v; /* If filtering was disabled at input, let it pass. */ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return true; vg = nbp_vlan_group_rcu(p); if (!vg || !vg->num_vlans) return false; if (!br_vlan_get_tag(skb, vid) && skb->vlan_proto != br->vlan_proto) *vid = 0; if (!*vid) { *vid = br_get_pvid(vg); if (!*vid || !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true)) return false; return true; } v = br_vlan_find(vg, *vid); if (v && br_vlan_state_allowed(br_vlan_get_state(v), true)) return true; return false; } static int br_vlan_add_existing(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u16 flags, bool *changed, struct netlink_ext_ack *extack) { bool would_change = __vlan_flags_would_change(vlan, flags); bool becomes_brentry = false; int err; if (!br_vlan_is_brentry(vlan)) { /* Trying to change flags of non-existent bridge vlan */ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) return -EINVAL; becomes_brentry = true; } /* Master VLANs that aren't brentries weren't notified before, * time to notify them now. */ if (becomes_brentry || would_change) { err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags, would_change, extack); if (err && err != -EOPNOTSUPP) return err; } if (becomes_brentry) { /* It was only kept for port vlans, now make it real */ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); if (err) { br_err(br, "failed to insert local address into bridge forwarding table\n"); goto err_fdb_insert; } refcount_inc(&vlan->refcnt); vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans++; *changed = true; br_multicast_toggle_one_vlan(vlan, true); } __vlan_flags_commit(vlan, flags); if (would_change) *changed = true; return 0; err_fdb_insert: br_switchdev_port_vlan_del(br->dev, vlan->vid); return err; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vg = br_vlan_group(br); vlan = br_vlan_find(vg, vid); if (vlan) return br_vlan_add_existing(br, vg, vlan, flags, changed, extack); vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) return -ENOMEM; vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!vlan->stats) { kfree(vlan); return -ENOMEM; } vlan->vid = vid; vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; vlan->br = br; if (flags & BRIDGE_VLAN_INFO_BRENTRY) refcount_set(&vlan->refcnt, 1); ret = __vlan_add(vlan, flags, extack); if (ret) { free_percpu(vlan->stats); kfree(vlan); } else { *changed = true; } return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int br_vlan_delete(struct net_bridge *br, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = br_vlan_group(br); v = br_vlan_find(vg, vid); if (!v || !br_vlan_is_brentry(v)) return -ENOENT; br_fdb_find_delete_local(br, NULL, br->dev->dev_addr, vid); br_fdb_delete_by_port(br, NULL, vid, 0); vlan_tunnel_info_del(vg, v); return __vlan_del(v); } void br_vlan_flush(struct net_bridge *br) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = br_vlan_group(br); __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); } struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid) { if (!vg) return NULL; return br_vlan_lookup(&vg->vlan_hash, vid); } /* Must be protected by RTNL. */ static void recalculate_group_addr(struct net_bridge *br) { if (br_opt_get(br, BROPT_GROUP_ADDR_SET)) return; spin_lock_bh(&br->lock); if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) { /* Bridge Group Address */ br->group_addr[5] = 0x00; } else { /* vlan_enabled && ETH_P_8021AD */ /* Provider Bridge Group Address */ br->group_addr[5] = 0x08; } spin_unlock_bh(&br->lock); } /* Must be protected by RTNL. */ void br_recalculate_fwd_mask(struct net_bridge *br) { if (!br_opt_get(br, BROPT_VLAN_ENABLED) || br->vlan_proto == htons(ETH_P_8021Q)) br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT; else /* vlan_enabled && ETH_P_8021AD */ br->group_fwd_mask_required = BR_GROUPFWD_8021AD & ~(1u << br->group_addr[5]); } int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = val, }; int err; if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) { br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); return err; } br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); br_multicast_toggle_vlan_snooping(br, false, NULL); } return 0; } bool br_vlan_enabled(const struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); return br_opt_get(br, BROPT_VLAN_ENABLED); } EXPORT_SYMBOL_GPL(br_vlan_enabled); int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto) { struct net_bridge *br = netdev_priv(dev); *p_proto = ntohs(br->vlan_proto); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_proto); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_protocol = ntohs(proto), }; int err = 0; struct net_bridge_port *p; struct net_bridge_vlan *vlan; struct net_bridge_vlan_group *vg; __be16 oldproto = br->vlan_proto; if (br->vlan_proto == proto) return 0; err = switchdev_port_attr_set(br->dev, &attr, extack); if (err && err != -EOPNOTSUPP) return err; /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; err = vlan_vid_add(p->dev, proto, vlan->vid); if (err) goto err_filt; } } br->vlan_proto = proto; recalculate_group_addr(br); br_recalculate_fwd_mask(br); /* Delete VLANs for the old proto from the device filter. */ list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, oldproto, vlan->vid); } } return 0; err_filt: attr.u.vlan_protocol = ntohs(oldproto); switchdev_port_attr_set(br->dev, &attr, NULL); list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } list_for_each_entry_continue_reverse(p, &br->port_list, list) { vg = nbp_vlan_group(p); list_for_each_entry(vlan, &vg->vlan_list, vlist) { if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) continue; vlan_vid_del(p->dev, proto, vlan->vid); } } return err; } int br_vlan_set_proto(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { if (!eth_type_vlan(htons(val))) return -EPROTONOSUPPORT; return __br_vlan_set_proto(br, htons(val), extack); } int br_vlan_set_stats(struct net_bridge *br, unsigned long val) { switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val); break; default: return -EINVAL; } return 0; } int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) { struct net_bridge_port *p; /* allow to change the option if there are no port vlans configured */ list_for_each_entry(p, &br->port_list, list) { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); if (vg->num_vlans) return -EBUSY; } switch (val) { case 0: case 1: br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); break; default: return -EINVAL; } return 0; } static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { struct net_bridge_vlan *v; if (vid != vg->pvid) return false; v = br_vlan_lookup(&vg->vlan_hash, vid); if (v && br_vlan_should_use(v) && (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return true; return false; } static void br_vlan_disable_default_pvid(struct net_bridge *br) { struct net_bridge_port *p; u16 pvid = br->default_pvid; /* Disable default_pvid on all ports where it is still * configured. */ if (vlan_default_pvid(br_vlan_group(br), pvid)) { if (!br_vlan_delete(br, pvid)) br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } list_for_each_entry(p, &br->port_list, list) { if (vlan_default_pvid(nbp_vlan_group(p), pvid) && !nbp_vlan_delete(p, pvid)) br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } br->default_pvid = 0; } int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid, struct netlink_ext_ack *extack) { const struct net_bridge_vlan *pvent; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; unsigned long *changed; bool vlchange; u16 old_pvid; int err = 0; if (!pvid) { br_vlan_disable_default_pvid(br); return 0; } changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!changed) return -ENOMEM; old_pvid = br->default_pvid; /* Update default_pvid config only if we do not conflict with * user configuration. */ vg = br_vlan_group(br); pvent = br_vlan_find(vg, pvid); if ((!old_pvid || vlan_default_pvid(vg, old_pvid)) && (!pvent || !br_vlan_should_use(pvent))) { err = br_vlan_add(br, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, extack); if (err) goto out; if (br_vlan_delete(br, old_pvid)) br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN); __set_bit(0, changed); } list_for_each_entry(p, &br->port_list, list) { /* Update default_pvid config only if we do not conflict with * user configuration. */ vg = nbp_vlan_group(p); if ((old_pvid && !vlan_default_pvid(vg, old_pvid)) || br_vlan_find(vg, pvid)) continue; err = nbp_vlan_add(p, pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, extack); if (err) goto err_port; if (nbp_vlan_delete(p, old_pvid)) br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN); br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN); __set_bit(p->port_no, changed); } br->default_pvid = pvid; out: bitmap_free(changed); return err; err_port: list_for_each_entry_continue_reverse(p, &br->port_list, list) { if (!test_bit(p->port_no, changed)) continue; if (old_pvid) { nbp_vlan_add(p, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &vlchange, NULL); br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN); } nbp_vlan_delete(p, pvid); br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN); } if (test_bit(0, changed)) { if (old_pvid) { br_vlan_add(br, old_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &vlchange, NULL); br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN); } br_vlan_delete(br, pvid); br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN); } goto out; } int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { u16 pvid = val; int err = 0; if (val >= VLAN_VID_MASK) return -EINVAL; if (pvid == br->default_pvid) goto out; /* Only allow default pvid change when filtering is disabled */ if (br_opt_get(br, BROPT_VLAN_ENABLED)) { pr_info_once("Please disable vlan filtering to change default_pvid\n"); err = -EPERM; goto out; } err = __br_vlan_set_default_pvid(br, pvid, extack); out: return err; } int br_vlan_init(struct net_bridge *br) { struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc(sizeof(*vg), GFP_KERNEL); if (!vg) goto out; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); br->vlan_proto = htons(ETH_P_8021Q); br->default_pvid = 1; rcu_assign_pointer(br->vlgrp, vg); out: return ret; err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: kfree(vg); goto out; } int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack) { struct switchdev_attr attr = { .orig_dev = p->br->dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP, .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED), }; struct net_bridge_vlan_group *vg; int ret = -ENOMEM; vg = kzalloc(sizeof(struct net_bridge_vlan_group), GFP_KERNEL); if (!vg) goto out; ret = switchdev_port_attr_set(p->dev, &attr, extack); if (ret && ret != -EOPNOTSUPP) goto err_vlan_enabled; ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params); if (ret) goto err_rhtbl; ret = vlan_tunnel_init(vg); if (ret) goto err_tunnel_init; INIT_LIST_HEAD(&vg->vlan_list); rcu_assign_pointer(p->vlgrp, vg); if (p->br->default_pvid) { bool changed; ret = nbp_vlan_add(p, p->br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED, &changed, extack); if (ret) goto err_vlan_add; br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN); } out: return ret; err_vlan_add: RCU_INIT_POINTER(p->vlgrp, NULL); synchronize_rcu(); vlan_tunnel_deinit(vg); err_tunnel_init: rhashtable_destroy(&vg->vlan_hash); err_rhtbl: err_vlan_enabled: kfree(vg); goto out; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. * changed must be true only if the vlan was created or updated */ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge_vlan *vlan; int ret; ASSERT_RTNL(); *changed = false; vlan = br_vlan_find(nbp_vlan_group(port), vid); if (vlan) { bool would_change = __vlan_flags_would_change(vlan, flags); if (would_change) { /* Pass the flags to the hardware bridge */ ret = br_switchdev_port_vlan_add(port->dev, vid, flags, true, extack); if (ret && ret != -EOPNOTSUPP) return ret; } __vlan_flags_commit(vlan, flags); *changed = would_change; return 0; } vlan = kzalloc(sizeof(*vlan), GFP_KERNEL); if (!vlan) return -ENOMEM; vlan->vid = vid; vlan->port = port; ret = __vlan_add(vlan, flags, extack); if (ret) kfree(vlan); else *changed = true; return ret; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan *v; ASSERT_RTNL(); v = br_vlan_find(nbp_vlan_group(port), vid); if (!v) return -ENOENT; br_fdb_find_delete_local(port->br, port, port->dev->dev_addr, vid); br_fdb_delete_by_port(port->br, port, vid, 0); return __vlan_del(v); } void nbp_vlan_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); synchronize_rcu(); __vlan_group_free(vg); } void br_vlan_get_stats(const struct net_bridge_vlan *v, struct pcpu_sw_netstats *stats) { int i; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(i) { u64 rxpackets, rxbytes, txpackets, txbytes; struct pcpu_sw_netstats *cpu_stats; unsigned int start; cpu_stats = per_cpu_ptr(v->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); rxpackets = u64_stats_read(&cpu_stats->rx_packets); rxbytes = u64_stats_read(&cpu_stats->rx_bytes); txbytes = u64_stats_read(&cpu_stats->tx_bytes); txpackets = u64_stats_read(&cpu_stats->tx_packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); u64_stats_add(&stats->tx_bytes, txbytes); u64_stats_add(&stats->tx_packets, txpackets); } } int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; *p_pvid = br_get_pvid(vg); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_pvid); int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid) { struct net_bridge_vlan_group *vg; struct net_bridge_port *p; p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group_rcu(netdev_priv(dev)); else return -EINVAL; *p_pvid = br_get_pvid(vg); return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu); void br_vlan_fill_forward_path_pvid(struct net_bridge *br, struct net_device_path_ctx *ctx, struct net_device_path *path) { struct net_bridge_vlan_group *vg; int idx = ctx->num_vlans - 1; u16 vid; path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return; vg = br_vlan_group(br); if (idx >= 0 && ctx->vlan[idx].proto == br->vlan_proto) { vid = ctx->vlan[idx].id; } else { path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG; vid = br_get_pvid(vg); } path->bridge.vlan_id = vid; path->bridge.vlan_proto = br->vlan_proto; } int br_vlan_fill_forward_path_mode(struct net_bridge *br, struct net_bridge_port *dst, struct net_device_path *path) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return 0; vg = nbp_vlan_group_rcu(dst); v = br_vlan_find(vg, path->bridge.vlan_id); if (!v || !br_vlan_should_use(v)) return -EINVAL; if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return 0; if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG) path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP; else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV) path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW; else path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG; return 0; } int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_check_rtnl(dev); if (p) vg = nbp_vlan_group(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group(netdev_priv(dev)); else return -EINVAL; v = br_vlan_find(vg, vid); if (!v) return -ENOENT; p_vinfo->vid = vid; p_vinfo->flags = v->flags; if (vid == br_get_pvid(vg)) p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info); int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; struct net_bridge_port *p; p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); else if (netif_is_bridge_master(dev)) vg = br_vlan_group_rcu(netdev_priv(dev)); else return -EINVAL; v = br_vlan_find(vg, vid); if (!v) return -ENOENT; p_vinfo->vid = vid; p_vinfo->flags = v->flags; if (vid == br_get_pvid(vg)) p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; return 0; } EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) { return is_vlan_dev(dev) && !!(vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING); } static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev, __always_unused struct netdev_nested_priv *priv) { return br_vlan_is_bind_vlan_dev(dev); } static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev) { int found; rcu_read_lock(); found = netdev_walk_all_upper_dev_rcu(dev, br_vlan_is_bind_vlan_dev_fn, NULL); rcu_read_unlock(); return !!found; } struct br_vlan_bind_walk_data { u16 vid; struct net_device *result; }; static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev, struct netdev_nested_priv *priv) { struct br_vlan_bind_walk_data *data = priv->data; int found = 0; if (br_vlan_is_bind_vlan_dev(dev) && vlan_dev_priv(dev)->vlan_id == data->vid) { data->result = dev; found = 1; } return found; } static struct net_device * br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid) { struct br_vlan_bind_walk_data data = { .vid = vid, }; struct netdev_nested_priv priv = { .data = (void *)&data, }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn, &priv); rcu_read_unlock(); return data.result; } static bool br_vlan_is_dev_up(const struct net_device *dev) { return !!(dev->flags & IFF_UP) && netif_oper_up(dev); } static void br_vlan_set_vlan_dev_state(const struct net_bridge *br, struct net_device *vlan_dev) { u16 vid = vlan_dev_priv(vlan_dev)->vlan_id; struct net_bridge_vlan_group *vg; struct net_bridge_port *p; bool has_carrier = false; if (!netif_carrier_ok(br->dev)) { netif_carrier_off(vlan_dev); return; } list_for_each_entry(p, &br->port_list, list) { vg = nbp_vlan_group(p); if (br_vlan_find(vg, vid) && br_vlan_is_dev_up(p->dev)) { has_carrier = true; break; } } if (has_carrier) netif_carrier_on(vlan_dev); else netif_carrier_off(vlan_dev); } static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) { struct net_bridge_vlan_group *vg = nbp_vlan_group(p); struct net_bridge_vlan *vlan; struct net_device *vlan_dev; list_for_each_entry(vlan, &vg->vlan_list, vlist) { vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vlan->vid); if (vlan_dev) { if (br_vlan_is_dev_up(p->dev)) { if (netif_carrier_ok(p->br->dev)) netif_carrier_on(vlan_dev); } else { br_vlan_set_vlan_dev_state(p->br, vlan_dev); } } } } static void br_vlan_upper_change(struct net_device *dev, struct net_device *upper_dev, bool linking) { struct net_bridge *br = netdev_priv(dev); if (!br_vlan_is_bind_vlan_dev(upper_dev)) return; if (linking) { br_vlan_set_vlan_dev_state(br, upper_dev); br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); } else { br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, br_vlan_has_upper_bind_vlan_dev(dev)); } } struct br_vlan_link_state_walk_data { struct net_bridge *br; }; static int br_vlan_link_state_change_fn(struct net_device *vlan_dev, struct netdev_nested_priv *priv) { struct br_vlan_link_state_walk_data *data = priv->data; if (br_vlan_is_bind_vlan_dev(vlan_dev)) br_vlan_set_vlan_dev_state(data->br, vlan_dev); return 0; } static void br_vlan_link_state_change(struct net_device *dev, struct net_bridge *br) { struct br_vlan_link_state_walk_data data = { .br = br }; struct netdev_nested_priv priv = { .data = (void *)&data, }; rcu_read_lock(); netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn, &priv); rcu_read_unlock(); } /* Must be protected by RTNL. */ static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid) { struct net_device *vlan_dev; if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) return; vlan_dev = br_vlan_get_upper_bind_vlan_dev(p->br->dev, vid); if (vlan_dev) br_vlan_set_vlan_dev_state(p->br, vlan_dev); } /* Must be protected by RTNL. */ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) { struct netdev_notifier_changeupper_info *info; struct net_bridge *br = netdev_priv(dev); int vlcmd = 0, ret = 0; bool changed = false; switch (event) { case NETDEV_REGISTER: ret = br_vlan_add(br, br->default_pvid, BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED | BRIDGE_VLAN_INFO_BRENTRY, &changed, NULL); vlcmd = RTM_NEWVLAN; break; case NETDEV_UNREGISTER: changed = !br_vlan_delete(br, br->default_pvid); vlcmd = RTM_DELVLAN; break; case NETDEV_CHANGEUPPER: info = ptr; br_vlan_upper_change(dev, info->upper_dev, info->linking); break; case NETDEV_CHANGE: case NETDEV_UP: if (!br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) break; br_vlan_link_state_change(dev, br); break; } if (changed) br_vlan_notify(br, NULL, br->default_pvid, 0, vlcmd); return ret; } /* Must be protected by RTNL. */ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) { if (!br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING)) return; switch (event) { case NETDEV_CHANGE: case NETDEV_DOWN: case NETDEV_UP: br_vlan_set_all_vlan_dev_state(p); break; } } static bool br_vlan_stats_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { struct pcpu_sw_netstats stats; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS); if (!nest) return false; br_vlan_get_stats(v, &stats); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES, u64_stats_read(&stats.rx_bytes), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS, u64_stats_read(&stats.rx_packets), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES, u64_stats_read(&stats.tx_bytes), BRIDGE_VLANDB_STATS_PAD) || nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS, u64_stats_read(&stats.tx_packets), BRIDGE_VLANDB_STATS_PAD)) goto out_err; nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } /* v_opts is used to dump the options which must be equal in the whole range */ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts, const struct net_bridge_port *p, u16 flags, bool dump_stats) { struct bridge_vlan_info info; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY); if (!nest) return false; memset(&info, 0, sizeof(info)); info.vid = vid; if (flags & BRIDGE_VLAN_INFO_UNTAGGED) info.flags |= BRIDGE_VLAN_INFO_UNTAGGED; if (flags & BRIDGE_VLAN_INFO_PVID) info.flags |= BRIDGE_VLAN_INFO_PVID; if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(info), &info)) goto out_err; if (vid_range && vid < vid_range && !(flags & BRIDGE_VLAN_INFO_PVID) && nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range)) goto out_err; if (v_opts) { if (!br_vlan_opts_fill(skb, v_opts, p)) goto out_err; if (dump_stats && !br_vlan_stats_fill(skb, v_opts)) goto out_err; } nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } static size_t rtnl_vlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY */ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_ENTRY_RANGE */ + nla_total_size(sizeof(struct bridge_vlan_info)) /* BRIDGE_VLANDB_ENTRY_INFO */ + br_vlan_opts_nl_size(); /* bridge vlan options */ } void br_vlan_notify(const struct net_bridge *br, const struct net_bridge_port *p, u16 vid, u16 vid_range, int cmd) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v = NULL; struct br_vlan_msg *bvm; struct nlmsghdr *nlh; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; u16 flags = 0; int ifindex; /* right now notifications are done only with rtnl held */ ASSERT_RTNL(); if (p) { ifindex = p->dev->ifindex; vg = nbp_vlan_group(p); net = dev_net(p->dev); } else { ifindex = br->dev->ifindex; vg = br_vlan_group(br); net = dev_net(br->dev); } skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL); if (!skb) goto out_err; err = -EMSGSIZE; nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0); if (!nlh) goto out_err; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = AF_BRIDGE; bvm->ifindex = ifindex; switch (cmd) { case RTM_NEWVLAN: /* need to find the vlan due to flags/options */ v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) goto out_kfree; flags = v->flags; if (br_get_pvid(vg) == v->vid) flags |= BRIDGE_VLAN_INFO_PVID; break; case RTM_DELVLAN: break; default: goto out_kfree; } if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false)) goto out_err; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); return; out_err: rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); out_kfree: kfree_skb(skb); } /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return v_curr->vid - range_end->vid == 1 && range_end->flags == v_curr->flags && br_vlan_opts_eq_range(v_curr, range_end); } static int br_vlan_dump_dev(const struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb, u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; struct nlmsghdr *nlh = NULL; struct net_bridge_port *p; struct br_vlan_msg *bvm; struct net_bridge *br; int err = 0; u16 pvid; if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) return -EINVAL; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); p = NULL; } else { /* global options are dumped only for bridge devices */ if (dump_global) return 0; p = br_port_get_rcu(dev); if (WARN_ON(!p)) return -EINVAL; vg = nbp_vlan_group_rcu(p); br = p->br; } if (!vg) return 0; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI); if (!nlh) return -EMSGSIZE; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = PF_BRIDGE; bvm->ifindex = dev->ifindex; pvid = br_get_pvid(vg); /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { if (!dump_global && !br_vlan_should_use(v)) continue; if (idx < s_idx) { idx++; continue; } if (!range_start) { range_start = v; range_end = v; continue; } if (dump_global) { if (br_vlan_global_opts_can_enter_range(v, range_end)) goto update_end; if (!br_vlan_global_opts_fill(skb, range_start->vid, range_end->vid, range_start)) { err = -EMSGSIZE; break; } /* advance number of filled vlans */ idx += range_end->vid - range_start->vid + 1; range_start = v; } else if (dump_stats || v->vid == pvid || !br_vlan_can_enter_range(v, range_end)) { u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, p, vlan_flags, dump_stats)) { err = -EMSGSIZE; break; } /* advance number of filled vlans */ idx += range_end->vid - range_start->vid + 1; range_start = v; } update_end: range_end = v; } /* err will be 0 and range_start will be set in 3 cases here: * - first vlan (range_start == range_end) * - last vlan (range_start == range_end, not in range) * - last vlan range (range_start != range_end, in range) */ if (!err && range_start) { if (dump_global && !br_vlan_global_opts_fill(skb, range_start->vid, range_end->vid, range_start)) err = -EMSGSIZE; else if (!dump_global && !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, range_start, p, br_vlan_flags(range_start, pvid), dump_stats)) err = -EMSGSIZE; } cb->args[1] = err ? idx : 0; nlmsg_end(skb, nlh); return err; } static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = { [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, }; static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1]; int idx = 0, err = 0, s_idx = cb->args[0]; struct net *net = sock_net(skb->sk); struct br_vlan_msg *bvm; struct net_device *dev; u32 dump_flags = 0; err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX, br_vlan_db_dump_pol, cb->extack); if (err < 0) return err; bvm = nlmsg_data(cb->nlh); if (dtb[BRIDGE_VLANDB_DUMP_FLAGS]) dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]); rcu_read_lock(); if (bvm->ifindex) { dev = dev_get_by_index_rcu(net, bvm->ifindex); if (!dev) { err = -ENODEV; goto out_err; } err = br_vlan_dump_dev(dev, skb, cb, dump_flags); /* if the dump completed without an error we return 0 here */ if (err != -EMSGSIZE) goto out_err; } else { for_each_netdev_rcu(net, dev) { if (idx < s_idx) goto skip; err = br_vlan_dump_dev(dev, skb, cb, dump_flags); if (err == -EMSGSIZE) break; skip: idx++; } } cb->args[0] = idx; rcu_read_unlock(); return skb->len; out_err: rcu_read_unlock(); return err; } static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { [BRIDGE_VLANDB_ENTRY_INFO] = NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)), [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), }; static int br_vlan_rtm_process_one(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL; struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1]; bool changed = false, skip_processing = false; struct net_bridge_vlan_group *vg; struct net_bridge_port *p = NULL; int err = 0, cmdmap = 0; struct net_bridge *br; if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { p = br_port_get_rtnl(dev); if (WARN_ON(!p)) return -ENODEV; br = p->br; vg = nbp_vlan_group(p); } if (WARN_ON(!vg)) return -ENODEV; err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr, br_vlan_db_policy, extack); if (err) return err; if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) { NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info"); return -EINVAL; } memset(&vrange_end, 0, sizeof(vrange_end)); vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN | BRIDGE_VLAN_INFO_RANGE_END)) { NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls"); return -EINVAL; } if (!br_vlan_valid_id(vinfo->vid, extack)) return -EINVAL; if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) { vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]); /* validate user-provided flags without RANGE_BEGIN */ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags; vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; /* vinfo_last is the range start, vinfo the range end */ vinfo_last = vinfo; vinfo = &vrange_end; if (!br_vlan_valid_id(vinfo->vid, extack) || !br_vlan_valid_range(vinfo, vinfo_last, extack)) return -EINVAL; } switch (cmd) { case RTM_NEWVLAN: cmdmap = RTM_SETLINK; skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); break; case RTM_DELVLAN: cmdmap = RTM_DELLINK; break; } if (!skip_processing) { struct bridge_vlan_info *tmp_last = vinfo_last; /* br_process_vlan_info may overwrite vinfo_last */ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last, &changed, extack); /* notify first if anything changed */ if (changed) br_ifinfo_notify(cmdmap, br, p); if (err) return err; } /* deal with options */ if (cmd == RTM_NEWVLAN) { struct net_bridge_vlan *range_start, *range_end; if (vinfo_last) { range_start = br_vlan_find(vg, vinfo_last->vid); range_end = br_vlan_find(vg, vinfo->vid); } else { range_start = br_vlan_find(vg, vinfo->vid); range_end = range_start; } err = br_vlan_process_options(br, p, range_start, range_end, tb, extack); } return err; } static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct br_vlan_msg *bvm; struct net_device *dev; struct nlattr *attr; int err, vlans = 0; int rem; /* this should validate the header and check for remaining bytes */ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL, extack); if (err < 0) return err; bvm = nlmsg_data(nlh); dev = __dev_get_by_index(net, bvm->ifindex); if (!dev) return -ENODEV; if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) { NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port"); return -EINVAL; } nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { switch (nla_type(attr)) { case BRIDGE_VLANDB_ENTRY: err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, extack); break; case BRIDGE_VLANDB_GLOBAL_OPTIONS: err = br_vlan_rtm_process_global_options(dev, attr, nlh->nlmsg_type, extack); break; default: continue; } vlans++; if (err) break; } if (!vlans) { NL_SET_ERR_MSG_MOD(extack, "No vlans found to process"); err = -EINVAL; } return err; } void br_vlan_rtnl_init(void) { rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0); rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0); rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, br_vlan_rtm_process, NULL, 0); } void br_vlan_rtnl_uninit(void) { rtnl_unregister(PF_BRIDGE, RTM_GETVLAN); rtnl_unregister(PF_BRIDGE, RTM_NEWVLAN); rtnl_unregister(PF_BRIDGE, RTM_DELVLAN); }
73 73 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 // SPDX-License-Identifier: GPL-2.0-only /* * VLAN netlink control interface * * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include "vlan.h" static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = { [IFLA_VLAN_ID] = { .type = NLA_U16 }, [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) }, [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_PROTOCOL] = { .type = NLA_U16 }, }; static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = { [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) }, }; static inline int vlan_validate_qos_map(struct nlattr *attr) { if (!attr) return 0; return nla_validate_nested_deprecated(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy, NULL); } static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; u16 id; int err; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EADDRNOTAVAIL; } } if (!data) { NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified"); return -EINVAL; } if (data[IFLA_VLAN_PROTOCOL]) { switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): break; default: NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol"); return -EPROTONOSUPPORT; } } if (data[IFLA_VLAN_ID]) { id = nla_get_u16(data[IFLA_VLAN_ID]); if (id >= VLAN_VID_MASK) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id"); return -ERANGE; } } if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; } } err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); if (err < 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map"); return err; } err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); if (err < 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map"); return err; } return 0; } static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct nlattr *attr; int rem, err; if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); err = vlan_dev_change_flags(dev, flags->flags, flags->mask); if (err) return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) return err; } } return 0; } static int vlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev; unsigned int max_mtu; __be16 proto; int err; if (!data[IFLA_VLAN_ID]) { NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified"); return -EINVAL; } if (!tb[IFLA_LINK]) { NL_SET_ERR_MSG_MOD(extack, "link not specified"); return -EINVAL; } real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; } if (data[IFLA_VLAN_PROTOCOL]) proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]); else proto = htons(ETH_P_8021Q); vlan->vlan_proto = proto; vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); vlan->real_dev = real_dev; dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlan->flags = VLAN_FLAG_REORDER_HDR; err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id, extack); if (err < 0) return err; max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN : real_dev->mtu; if (!tb[IFLA_MTU]) dev->mtu = max_mtu; else if (dev->mtu > max_mtu) return -EINVAL; /* Note: If this initial vlan_changelink() fails, we need * to call vlan_dev_free_egress_priority() to free memory. */ err = vlan_changelink(dev, tb, data, extack); if (!err) err = register_vlan_dev(dev, extack); if (err) vlan_dev_free_egress_priority(dev); return err; } static inline size_t vlan_qos_map_size(unsigned int n) { if (n == 0) return 0; /* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */ return nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n; } static size_t vlan_get_size(const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return nla_total_size(2) + /* IFLA_VLAN_PROTOCOL */ nla_total_size(2) + /* IFLA_VLAN_ID */ nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */ vlan_qos_map_size(vlan->nr_ingress_mappings) + vlan_qos_map_size(vlan->nr_egress_mappings); } static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *pm; struct ifla_vlan_flags f; struct ifla_vlan_qos_mapping m; struct nlattr *nest; unsigned int i; if (nla_put_be16(skb, IFLA_VLAN_PROTOCOL, vlan->vlan_proto) || nla_put_u16(skb, IFLA_VLAN_ID, vlan->vlan_id)) goto nla_put_failure; if (vlan->flags) { f.flags = vlan->flags; f.mask = ~0; if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f)) goto nla_put_failure; } if (vlan->nr_ingress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_INGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) { if (!vlan->ingress_priority_map[i]) continue; m.from = i; m.to = vlan->ingress_priority_map[i]; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (vlan->nr_egress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_EGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { for (pm = vlan->egress_priority_map[i]; pm; pm = pm->next) { if (!pm->vlan_qos) continue; m.from = pm->priority; m.to = (pm->vlan_qos >> 13) & 0x7; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } } nla_nest_end(skb, nest); } return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vlan_get_link_net(const struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return dev_net(real_dev); } struct rtnl_link_ops vlan_link_ops __read_mostly = { .kind = "vlan", .maxtype = IFLA_VLAN_MAX, .policy = vlan_policy, .priv_size = sizeof(struct vlan_dev_priv), .setup = vlan_setup, .validate = vlan_validate, .newlink = vlan_newlink, .changelink = vlan_changelink, .dellink = unregister_vlan_dev, .get_size = vlan_get_size, .fill_info = vlan_fill_info, .get_link_net = vlan_get_link_net, }; int __init vlan_netlink_init(void) { return rtnl_link_register(&vlan_link_ops); } void __exit vlan_netlink_fini(void) { rtnl_link_unregister(&vlan_link_ops); } MODULE_ALIAS_RTNL_LINK("vlan");
653 653 655 1 654 76 10 10 76 655 642 153 652 654 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 // SPDX-License-Identifier: GPL-2.0-only /* * ratelimit.c - Do something with rate limit. * * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com> * * 2008-05-01 rewrite the function and use a ratelimit_state data struct as * parameter. Now every user can use their own standalone ratelimit_state. */ #include <linux/ratelimit.h> #include <linux/jiffies.h> #include <linux/export.h> /* * __ratelimit - rate limiting * @rs: ratelimit_state data * @func: name of calling function * * This enforces a rate limit: not more than @rs->burst callbacks * in every @rs->interval * * RETURNS: * 0 means callbacks will be suppressed. * 1 means go ahead and do it. */ int ___ratelimit(struct ratelimit_state *rs, const char *func) { /* Paired with WRITE_ONCE() in .proc_handler(). * Changing two values seperately could be inconsistent * and some message could be lost. (See: net_ratelimit_state). */ int interval = READ_ONCE(rs->interval); int burst = READ_ONCE(rs->burst); unsigned long flags; int ret; if (!interval) return 1; /* * If we contend on this state's lock then almost * by definition we are too busy to print a message, * in addition to the one that will be printed by * the entity that is holding the lock already: */ if (!raw_spin_trylock_irqsave(&rs->lock, flags)) return 0; if (!rs->begin) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + interval)) { if (rs->missed) { if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { printk_deferred(KERN_WARNING "%s: %d callbacks suppressed\n", func, rs->missed); rs->missed = 0; } } rs->begin = jiffies; rs->printed = 0; } if (burst && burst > rs->printed) { rs->printed++; ret = 1; } else { rs->missed++; ret = 0; } raw_spin_unlock_irqrestore(&rs->lock, flags); return ret; } EXPORT_SYMBOL(___ratelimit);
11 11 192 192 192 192 5 5 302 302 302 270 270 270 162 162 11 11 5 11 5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0-only /* * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ #define pr_fmt(fmt) "TCP: " fmt #include <linux/module.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (strcmp(e->name, name) == 0) return e; } return NULL; } void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); trace_tcp_cong_state_set(sk, ca_state); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } /* Must be called with rcu lock held */ static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, const char *name) { struct tcp_congestion_ops *ca = tcp_ca_find(name); #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif return ca; } /* Simple linear search, not much in here. */ struct tcp_congestion_ops *tcp_ca_find_key(u32 key) { struct tcp_congestion_ops *e; list_for_each_entry_rcu(e, &tcp_cong_list, list) { if (e->key == key) return e; } return NULL; } int tcp_validate_congestion_control(struct tcp_congestion_ops *ca) { /* all algorithms must implement these */ if (!ca->ssthresh || !ca->undo_cwnd || !(ca->cong_avoid || ca->cong_control)) { pr_err("%s does not implement required ops\n", ca->name); return -EINVAL; } return 0; } /* Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) { pr_notice("%s already registered or non-unique key\n", ca->name); ret = -EEXIST; } else { list_add_tail_rcu(&ca->list, &tcp_cong_list); pr_debug("%s registered\n", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module gets removed entirely. * * A try_module_get() should fail by now as our module is * in "going" state since no refs are held anymore and * module_exit() handler being called. */ synchronize_rcu(); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Replace a registered old ca with a new one. * * The new ca must have the same name as the old one, that has been * registered. */ int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca) { struct tcp_congestion_ops *existing; int ret; ret = tcp_validate_congestion_control(ca); if (ret) return ret; ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name)); spin_lock(&tcp_cong_list_lock); existing = tcp_ca_find_key(old_ca->key); if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) { pr_notice("%s not registered or non-unique key\n", ca->name); ret = -EINVAL; } else if (existing != old_ca) { pr_notice("invalid old congestion control algorithm to replace\n"); ret = -EINVAL; } else { /* Add the new one before removing the old one to keep * one implementation available all the time. */ list_add_tail_rcu(&ca->list, &tcp_cong_list); list_del_rcu(&existing->list); pr_debug("%s updated\n", ca->name); } spin_unlock(&tcp_cong_list_lock); /* Wait for outstanding readers to complete before the * module or struct_ops gets removed entirely. */ if (!ret) synchronize_rcu(); return ret; } u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; } rcu_read_unlock(); return key; } char *tcp_ca_get_name_by_key(u32 key, char *buffer) { const struct tcp_congestion_ops *ca; char *ret = NULL; rcu_read_lock(); ca = tcp_ca_find_key(key); if (ca) ret = strncpy(buffer, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); return ret; } /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); } void tcp_init_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); icsk->icsk_ca_initialized = 1; } static void tcp_reinit_congestion_control(struct sock *sk, const struct tcp_congestion_ops *ca) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else INET_ECN_dontxmit(sk); if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) tcp_init_congestion_control(sk); } /* Manage refcounts on socket close. */ void tcp_cleanup_congestion_control(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *prev; int ret; rcu_read_lock(); ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else if (!net_eq(net, &init_net) && !(ca->flags & TCP_CONG_NON_RESTRICTED)) { /* Only init netns can set default to a restricted algorithm */ ret = -EPERM; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } rcu_read_unlock(); return ret; } /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { return tcp_set_default_congestion_control(&init_net, CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); /* Build string with list of available congestion control values */ void tcp_get_available_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Get current default congestion control */ void tcp_get_default_congestion_control(struct net *net, char *name) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } /* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); if (WARN_ON_ONCE(offs >= maxlen)) break; } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *saved_clone, *clone, *name; int ret = 0; saved_clone = clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } /* pass 2 clear old values */ list_for_each_entry_rcu(ca, &tcp_cong_list, list) ca->flags &= ~TCP_CONG_NON_RESTRICTED; /* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) ca->flags |= TCP_CONG_NON_RESTRICTED; } out: spin_unlock(&tcp_cong_list_lock); kfree(saved_clone); return ret; } /* Change congestion control for socket. If load is false, then it is the * responsibility of the caller to call tcp_init_congestion_control or * tcp_reinit_congestion_control (if the current congestion control was * already initialized. */ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin) { struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_congestion_ops *ca; int err = 0; if (icsk->icsk_ca_dst_locked) return -EPERM; rcu_read_lock(); if (!load) ca = tcp_ca_find(name); else ca = tcp_ca_find_autoload(sock_net(sk), name); /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } if (!ca) err = -ENOENT; else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) err = -EPERM; else if (!bpf_try_module_get(ca, ca->owner)) err = -EBUSY; else tcp_reinit_congestion_control(sk, ca); out: rcu_read_unlock(); return err; } /* Slow start is used when congestion window is no greater than the slow start * threshold. We base on RFC2581 and also handle stretch ACKs properly. * We do not implement RFC3465 Appropriate Byte Counting (ABC) per se but * something better;) a packet is only considered (s)acked in its entirety to * defend the ACK attacks described in the RFC. Slow start processes a stretch * ACK of degree N as if N acks of degree 1 are received back to back except * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ __bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); acked -= cwnd - tcp_snd_cwnd(tp); tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), * for every packet that was ACKed. */ __bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); /* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ __bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; /* In "safe" area, increase. */ if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } /* In dangerous area, increase slowly. */ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ __bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); __bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); struct tcp_congestion_ops tcp_reno = { .flags = TCP_CONG_NON_RESTRICTED, .name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .undo_cwnd = tcp_reno_undo_cwnd, };
21 21 15 6 230 230 223 222 222 41 25 25 25 41 222 71 177 178 231 231 266 23 265 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 /* * Copyright (c) 2018 Cumulus Networks. All rights reserved. * Copyright (c) 2018 David Ahern <dsa@cumulusnetworks.com> * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/bitmap.h> #include <linux/in6.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/rhashtable.h> #include <linux/spinlock_types.h> #include <linux/types.h> #include <net/fib_notifier.h> #include <net/inet_dscp.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/fib_rules.h> #include <net/net_namespace.h> #include <net/nexthop.h> #include <linux/debugfs.h> #include "netdevsim.h" struct nsim_fib_entry { u64 max; atomic64_t num; }; struct nsim_per_fib_data { struct nsim_fib_entry fib; struct nsim_fib_entry rules; }; struct nsim_fib_data { struct notifier_block fib_nb; struct nsim_per_fib_data ipv4; struct nsim_per_fib_data ipv6; struct nsim_fib_entry nexthops; struct rhashtable fib_rt_ht; struct list_head fib_rt_list; struct mutex fib_lock; /* Protects FIB HT and list */ struct notifier_block nexthop_nb; struct rhashtable nexthop_ht; struct devlink *devlink; struct work_struct fib_event_work; struct work_struct fib_flush_work; struct list_head fib_event_queue; spinlock_t fib_event_queue_lock; /* Protects fib event queue list */ struct mutex nh_lock; /* Protects NH HT */ struct dentry *ddir; bool fail_route_offload; bool fail_res_nexthop_group_replace; bool fail_nexthop_bucket_replace; bool fail_route_delete; }; struct nsim_fib_rt_key { unsigned char addr[sizeof(struct in6_addr)]; unsigned char prefix_len; int family; u32 tb_id; }; struct nsim_fib_rt { struct nsim_fib_rt_key key; struct rhash_head ht_node; struct list_head list; /* Member of fib_rt_list */ }; struct nsim_fib4_rt { struct nsim_fib_rt common; struct fib_info *fi; dscp_t dscp; u8 type; }; struct nsim_fib6_rt { struct nsim_fib_rt common; struct list_head nh_list; unsigned int nhs; }; struct nsim_fib6_rt_nh { struct list_head list; /* Member of nh_list */ struct fib6_info *rt; }; struct nsim_fib6_event { struct fib6_info **rt_arr; unsigned int nrt6; }; struct nsim_fib_event { struct list_head list; /* node in fib queue */ union { struct fib_entry_notifier_info fen_info; struct nsim_fib6_event fib6_event; }; struct nsim_fib_data *data; unsigned long event; int family; }; static const struct rhashtable_params nsim_fib_rt_ht_params = { .key_offset = offsetof(struct nsim_fib_rt, key), .head_offset = offsetof(struct nsim_fib_rt, ht_node), .key_len = sizeof(struct nsim_fib_rt_key), .automatic_shrinking = true, }; struct nsim_nexthop { struct rhash_head ht_node; u64 occ; u32 id; bool is_resilient; }; static const struct rhashtable_params nsim_nexthop_ht_params = { .key_offset = offsetof(struct nsim_nexthop, id), .head_offset = offsetof(struct nsim_nexthop, ht_node), .key_len = sizeof(u32), .automatic_shrinking = true, }; u64 nsim_fib_get_val(struct nsim_fib_data *fib_data, enum nsim_resource_id res_id, bool max) { struct nsim_fib_entry *entry; switch (res_id) { case NSIM_RESOURCE_IPV4_FIB: entry = &fib_data->ipv4.fib; break; case NSIM_RESOURCE_IPV4_FIB_RULES: entry = &fib_data->ipv4.rules; break; case NSIM_RESOURCE_IPV6_FIB: entry = &fib_data->ipv6.fib; break; case NSIM_RESOURCE_IPV6_FIB_RULES: entry = &fib_data->ipv6.rules; break; case NSIM_RESOURCE_NEXTHOPS: entry = &fib_data->nexthops; break; default: return 0; } return max ? entry->max : atomic64_read(&entry->num); } static void nsim_fib_set_max(struct nsim_fib_data *fib_data, enum nsim_resource_id res_id, u64 val) { struct nsim_fib_entry *entry; switch (res_id) { case NSIM_RESOURCE_IPV4_FIB: entry = &fib_data->ipv4.fib; break; case NSIM_RESOURCE_IPV4_FIB_RULES: entry = &fib_data->ipv4.rules; break; case NSIM_RESOURCE_IPV6_FIB: entry = &fib_data->ipv6.fib; break; case NSIM_RESOURCE_IPV6_FIB_RULES: entry = &fib_data->ipv6.rules; break; case NSIM_RESOURCE_NEXTHOPS: entry = &fib_data->nexthops; break; default: WARN_ON(1); return; } entry->max = val; } static int nsim_fib_rule_account(struct nsim_fib_entry *entry, bool add, struct netlink_ext_ack *extack) { int err = 0; if (add) { if (!atomic64_add_unless(&entry->num, 1, entry->max)) { err = -ENOSPC; NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib rule entries"); } } else { atomic64_dec_if_positive(&entry->num); } return err; } static int nsim_fib_rule_event(struct nsim_fib_data *data, struct fib_notifier_info *info, bool add) { struct netlink_ext_ack *extack = info->extack; int err = 0; switch (info->family) { case AF_INET: err = nsim_fib_rule_account(&data->ipv4.rules, add, extack); break; case AF_INET6: err = nsim_fib_rule_account(&data->ipv6.rules, add, extack); break; } return err; } static int nsim_fib_account(struct nsim_fib_entry *entry, bool add) { int err = 0; if (add) { if (!atomic64_add_unless(&entry->num, 1, entry->max)) err = -ENOSPC; } else { atomic64_dec_if_positive(&entry->num); } return err; } static void nsim_fib_rt_init(struct nsim_fib_data *data, struct nsim_fib_rt *fib_rt, const void *addr, size_t addr_len, unsigned int prefix_len, int family, u32 tb_id) { memcpy(fib_rt->key.addr, addr, addr_len); fib_rt->key.prefix_len = prefix_len; fib_rt->key.family = family; fib_rt->key.tb_id = tb_id; list_add(&fib_rt->list, &data->fib_rt_list); } static void nsim_fib_rt_fini(struct nsim_fib_rt *fib_rt) { list_del(&fib_rt->list); } static struct nsim_fib_rt *nsim_fib_rt_lookup(struct rhashtable *fib_rt_ht, const void *addr, size_t addr_len, unsigned int prefix_len, int family, u32 tb_id) { struct nsim_fib_rt_key key; memset(&key, 0, sizeof(key)); memcpy(key.addr, addr, addr_len); key.prefix_len = prefix_len; key.family = family; key.tb_id = tb_id; return rhashtable_lookup_fast(fib_rt_ht, &key, nsim_fib_rt_ht_params); } static struct nsim_fib4_rt * nsim_fib4_rt_create(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt; fib4_rt = kzalloc(sizeof(*fib4_rt), GFP_KERNEL); if (!fib4_rt) return NULL; nsim_fib_rt_init(data, &fib4_rt->common, &fen_info->dst, sizeof(u32), fen_info->dst_len, AF_INET, fen_info->tb_id); fib4_rt->fi = fen_info->fi; fib_info_hold(fib4_rt->fi); fib4_rt->dscp = fen_info->dscp; fib4_rt->type = fen_info->type; return fib4_rt; } static void nsim_fib4_rt_destroy(struct nsim_fib4_rt *fib4_rt) { fib_info_put(fib4_rt->fi); nsim_fib_rt_fini(&fib4_rt->common); kfree(fib4_rt); } static struct nsim_fib4_rt * nsim_fib4_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib_entry_notifier_info *fen_info) { struct nsim_fib_rt *fib_rt; fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &fen_info->dst, sizeof(u32), fen_info->dst_len, AF_INET, fen_info->tb_id); if (!fib_rt) return NULL; return container_of(fib_rt, struct nsim_fib4_rt, common); } static void nsim_fib4_rt_offload_failed_flag_set(struct net *net, struct fib_entry_notifier_info *fen_info) { u32 *p_dst = (u32 *)&fen_info->dst; struct fib_rt_info fri; fri.fi = fen_info->fi; fri.tb_id = fen_info->tb_id; fri.dst = cpu_to_be32(*p_dst); fri.dst_len = fen_info->dst_len; fri.dscp = fen_info->dscp; fri.type = fen_info->type; fri.offload = false; fri.trap = false; fri.offload_failed = true; fib_alias_hw_flags_set(net, &fri); } static void nsim_fib4_rt_hw_flags_set(struct net *net, const struct nsim_fib4_rt *fib4_rt, bool trap) { u32 *p_dst = (u32 *) fib4_rt->common.key.addr; int dst_len = fib4_rt->common.key.prefix_len; struct fib_rt_info fri; fri.fi = fib4_rt->fi; fri.tb_id = fib4_rt->common.key.tb_id; fri.dst = cpu_to_be32(*p_dst); fri.dst_len = dst_len; fri.dscp = fib4_rt->dscp; fri.type = fib4_rt->type; fri.offload = false; fri.trap = trap; fri.offload_failed = false; fib_alias_hw_flags_set(net, &fri); } static int nsim_fib4_rt_add(struct nsim_fib_data *data, struct nsim_fib4_rt *fib4_rt) { struct net *net = devlink_net(data->devlink); int err; err = rhashtable_insert_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) goto err_fib_dismiss; /* Simulate hardware programming latency. */ msleep(1); nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); return 0; err_fib_dismiss: /* Drop the accounting that was increased from the notification * context when FIB_EVENT_ENTRY_REPLACE was triggered. */ nsim_fib_account(&data->ipv4.fib, false); return err; } static int nsim_fib4_rt_replace(struct nsim_fib_data *data, struct nsim_fib4_rt *fib4_rt, struct nsim_fib4_rt *fib4_rt_old) { struct net *net = devlink_net(data->devlink); int err; /* We are replacing a route, so need to remove the accounting which * was increased when FIB_EVENT_ENTRY_REPLACE was triggered. */ err = nsim_fib_account(&data->ipv4.fib, false); if (err) return err; err = rhashtable_replace_fast(&data->fib_rt_ht, &fib4_rt_old->common.ht_node, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) return err; msleep(1); nsim_fib4_rt_hw_flags_set(net, fib4_rt, true); nsim_fib4_rt_hw_flags_set(net, fib4_rt_old, false); nsim_fib4_rt_destroy(fib4_rt_old); return 0; } static int nsim_fib4_rt_insert(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt, *fib4_rt_old; int err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib4_rt = nsim_fib4_rt_create(data, fen_info); if (!fib4_rt) return -ENOMEM; fib4_rt_old = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); if (!fib4_rt_old) err = nsim_fib4_rt_add(data, fib4_rt); else err = nsim_fib4_rt_replace(data, fib4_rt, fib4_rt_old); if (err) nsim_fib4_rt_destroy(fib4_rt); return err; } static void nsim_fib4_rt_remove(struct nsim_fib_data *data, const struct fib_entry_notifier_info *fen_info) { struct nsim_fib4_rt *fib4_rt; fib4_rt = nsim_fib4_rt_lookup(&data->fib_rt_ht, fen_info); if (!fib4_rt) return; rhashtable_remove_fast(&data->fib_rt_ht, &fib4_rt->common.ht_node, nsim_fib_rt_ht_params); nsim_fib4_rt_destroy(fib4_rt); } static int nsim_fib4_event(struct nsim_fib_data *data, struct fib_entry_notifier_info *fen_info, unsigned long event) { int err = 0; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib4_rt_insert(data, fen_info); if (err) { struct net *net = devlink_net(data->devlink); nsim_fib4_rt_offload_failed_flag_set(net, fen_info); } break; case FIB_EVENT_ENTRY_DEL: nsim_fib4_rt_remove(data, fen_info); break; default: break; } return err; } static struct nsim_fib6_rt_nh * nsim_fib6_rt_nh_find(const struct nsim_fib6_rt *fib6_rt, const struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) { if (fib6_rt_nh->rt == rt) return fib6_rt_nh; } return NULL; } static int nsim_fib6_rt_nh_add(struct nsim_fib6_rt *fib6_rt, struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; fib6_rt_nh = kzalloc(sizeof(*fib6_rt_nh), GFP_KERNEL); if (!fib6_rt_nh) return -ENOMEM; fib6_info_hold(rt); fib6_rt_nh->rt = rt; list_add_tail(&fib6_rt_nh->list, &fib6_rt->nh_list); fib6_rt->nhs++; return 0; } #if IS_ENABLED(CONFIG_IPV6) static void nsim_rt6_release(struct fib6_info *rt) { fib6_info_release(rt); } #else static void nsim_rt6_release(struct fib6_info *rt) { } #endif static void nsim_fib6_rt_nh_del(struct nsim_fib6_rt *fib6_rt, const struct fib6_info *rt) { struct nsim_fib6_rt_nh *fib6_rt_nh; fib6_rt_nh = nsim_fib6_rt_nh_find(fib6_rt, rt); if (!fib6_rt_nh) return; fib6_rt->nhs--; list_del(&fib6_rt_nh->list); nsim_rt6_release(fib6_rt_nh->rt); kfree(fib6_rt_nh); } static struct nsim_fib6_rt * nsim_fib6_rt_create(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { struct fib6_info *rt = rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i = 0; int err; fib6_rt = kzalloc(sizeof(*fib6_rt), GFP_KERNEL); if (!fib6_rt) return ERR_PTR(-ENOMEM); nsim_fib_rt_init(data, &fib6_rt->common, &rt->fib6_dst.addr, sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen, AF_INET6, rt->fib6_table->tb6_id); /* We consider a multipath IPv6 route as one entry, but it can be made * up from several fib6_info structs (one for each nexthop), so we * add them all to the same list under the entry. */ INIT_LIST_HEAD(&fib6_rt->nh_list); for (i = 0; i < nrt6; i++) { err = nsim_fib6_rt_nh_add(fib6_rt, rt_arr[i]); if (err) goto err_fib6_rt_nh_del; } return fib6_rt; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { nsim_fib6_rt_nh_del(fib6_rt, rt_arr[i]); } nsim_fib_rt_fini(&fib6_rt->common); kfree(fib6_rt); return ERR_PTR(err); } static void nsim_fib6_rt_destroy(struct nsim_fib6_rt *fib6_rt) { struct nsim_fib6_rt_nh *iter, *tmp; list_for_each_entry_safe(iter, tmp, &fib6_rt->nh_list, list) nsim_fib6_rt_nh_del(fib6_rt, iter->rt); WARN_ON_ONCE(!list_empty(&fib6_rt->nh_list)); nsim_fib_rt_fini(&fib6_rt->common); kfree(fib6_rt); } static struct nsim_fib6_rt * nsim_fib6_rt_lookup(struct rhashtable *fib_rt_ht, const struct fib6_info *rt) { struct nsim_fib_rt *fib_rt; fib_rt = nsim_fib_rt_lookup(fib_rt_ht, &rt->fib6_dst.addr, sizeof(rt->fib6_dst.addr), rt->fib6_dst.plen, AF_INET6, rt->fib6_table->tb6_id); if (!fib_rt) return NULL; return container_of(fib_rt, struct nsim_fib6_rt, common); } static int nsim_fib6_rt_append(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i, err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt) return -EINVAL; for (i = 0; i < fib6_event->nrt6; i++) { err = nsim_fib6_rt_nh_add(fib6_rt, fib6_event->rt_arr[i]); if (err) goto err_fib6_rt_nh_del; WRITE_ONCE(fib6_event->rt_arr[i]->trap, true); } return 0; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { WRITE_ONCE(fib6_event->rt_arr[i]->trap, false); nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); } return err; } #if IS_ENABLED(CONFIG_IPV6) static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { struct net *net = devlink_net(data->devlink); int i; for (i = 0; i < nrt6; i++) fib6_info_hw_flags_set(net, rt_arr[i], false, false, true); } #else static void nsim_fib6_rt_offload_failed_flag_set(struct nsim_fib_data *data, struct fib6_info **rt_arr, unsigned int nrt6) { } #endif #if IS_ENABLED(CONFIG_IPV6) static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, const struct nsim_fib6_rt *fib6_rt, bool trap) { struct net *net = devlink_net(data->devlink); struct nsim_fib6_rt_nh *fib6_rt_nh; list_for_each_entry(fib6_rt_nh, &fib6_rt->nh_list, list) fib6_info_hw_flags_set(net, fib6_rt_nh->rt, false, trap, false); } #else static void nsim_fib6_rt_hw_flags_set(struct nsim_fib_data *data, const struct nsim_fib6_rt *fib6_rt, bool trap) { } #endif static int nsim_fib6_rt_add(struct nsim_fib_data *data, struct nsim_fib6_rt *fib6_rt) { int err; err = rhashtable_insert_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) goto err_fib_dismiss; msleep(1); nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); return 0; err_fib_dismiss: /* Drop the accounting that was increased from the notification * context when FIB_EVENT_ENTRY_REPLACE was triggered. */ nsim_fib_account(&data->ipv6.fib, false); return err; } static int nsim_fib6_rt_replace(struct nsim_fib_data *data, struct nsim_fib6_rt *fib6_rt, struct nsim_fib6_rt *fib6_rt_old) { int err; /* We are replacing a route, so need to remove the accounting which * was increased when FIB_EVENT_ENTRY_REPLACE was triggered. */ err = nsim_fib_account(&data->ipv6.fib, false); if (err) return err; err = rhashtable_replace_fast(&data->fib_rt_ht, &fib6_rt_old->common.ht_node, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); if (err) return err; msleep(1); nsim_fib6_rt_hw_flags_set(data, fib6_rt, true); nsim_fib6_rt_hw_flags_set(data, fib6_rt_old, false); nsim_fib6_rt_destroy(fib6_rt_old); return 0; } static int nsim_fib6_rt_insert(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt, *fib6_rt_old; int err; if (data->fail_route_offload) { /* For testing purposes, user set debugfs fail_route_offload * value to true. Simulate hardware programming latency and then * fail. */ msleep(1); return -EINVAL; } fib6_rt = nsim_fib6_rt_create(data, fib6_event->rt_arr, fib6_event->nrt6); if (IS_ERR(fib6_rt)) return PTR_ERR(fib6_rt); fib6_rt_old = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt_old) err = nsim_fib6_rt_add(data, fib6_rt); else err = nsim_fib6_rt_replace(data, fib6_rt, fib6_rt_old); if (err) nsim_fib6_rt_destroy(fib6_rt); return err; } static void nsim_fib6_rt_remove(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event) { struct fib6_info *rt = fib6_event->rt_arr[0]; struct nsim_fib6_rt *fib6_rt; int i; /* Multipath routes are first added to the FIB trie and only then * notified. If we vetoed the addition, we will get a delete * notification for a route we do not have. Therefore, do not warn if * route was not found. */ fib6_rt = nsim_fib6_rt_lookup(&data->fib_rt_ht, rt); if (!fib6_rt) return; /* If not all the nexthops are deleted, then only reduce the nexthop * group. */ if (fib6_event->nrt6 != fib6_rt->nhs) { for (i = 0; i < fib6_event->nrt6; i++) nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); return; } rhashtable_remove_fast(&data->fib_rt_ht, &fib6_rt->common.ht_node, nsim_fib_rt_ht_params); nsim_fib6_rt_destroy(fib6_rt); } static int nsim_fib6_event_init(struct nsim_fib6_event *fib6_event, struct fib6_entry_notifier_info *fen6_info) { struct fib6_info *rt = fen6_info->rt; struct fib6_info **rt_arr; struct fib6_info *iter; unsigned int nrt6; int i = 0; nrt6 = fen6_info->nsiblings + 1; rt_arr = kcalloc(nrt6, sizeof(struct fib6_info *), GFP_ATOMIC); if (!rt_arr) return -ENOMEM; fib6_event->rt_arr = rt_arr; fib6_event->nrt6 = nrt6; rt_arr[0] = rt; fib6_info_hold(rt); if (!fen6_info->nsiblings) return 0; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (i == fen6_info->nsiblings) break; rt_arr[i + 1] = iter; fib6_info_hold(iter); i++; } WARN_ON_ONCE(i != fen6_info->nsiblings); return 0; } static void nsim_fib6_event_fini(struct nsim_fib6_event *fib6_event) { int i; for (i = 0; i < fib6_event->nrt6; i++) nsim_rt6_release(fib6_event->rt_arr[i]); kfree(fib6_event->rt_arr); } static int nsim_fib6_event(struct nsim_fib_data *data, struct nsim_fib6_event *fib6_event, unsigned long event) { int err; if (fib6_event->rt_arr[0]->fib6_src.plen) return 0; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib6_rt_insert(data, fib6_event); if (err) goto err_rt_offload_failed_flag_set; break; case FIB_EVENT_ENTRY_APPEND: err = nsim_fib6_rt_append(data, fib6_event); if (err) goto err_rt_offload_failed_flag_set; break; case FIB_EVENT_ENTRY_DEL: nsim_fib6_rt_remove(data, fib6_event); break; default: break; } return 0; err_rt_offload_failed_flag_set: nsim_fib6_rt_offload_failed_flag_set(data, fib6_event->rt_arr, fib6_event->nrt6); return err; } static void nsim_fib_event(struct nsim_fib_event *fib_event) { switch (fib_event->family) { case AF_INET: nsim_fib4_event(fib_event->data, &fib_event->fen_info, fib_event->event); fib_info_put(fib_event->fen_info.fi); break; case AF_INET6: nsim_fib6_event(fib_event->data, &fib_event->fib6_event, fib_event->event); nsim_fib6_event_fini(&fib_event->fib6_event); break; } } static int nsim_fib4_prepare_event(struct fib_notifier_info *info, struct nsim_fib_event *fib_event, unsigned long event) { struct nsim_fib_data *data = fib_event->data; struct fib_entry_notifier_info *fen_info; struct netlink_ext_ack *extack; int err = 0; fen_info = container_of(info, struct fib_entry_notifier_info, info); fib_event->fen_info = *fen_info; extack = info->extack; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib_account(&data->ipv4.fib, true); if (err) { NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries"); return err; } break; case FIB_EVENT_ENTRY_DEL: if (data->fail_route_delete) { NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion"); return -EINVAL; } nsim_fib_account(&data->ipv4.fib, false); break; } /* Take reference on fib_info to prevent it from being * freed while event is queued. Release it afterwards. */ fib_info_hold(fib_event->fen_info.fi); return 0; } static int nsim_fib6_prepare_event(struct fib_notifier_info *info, struct nsim_fib_event *fib_event, unsigned long event) { struct nsim_fib_data *data = fib_event->data; struct fib6_entry_notifier_info *fen6_info; struct netlink_ext_ack *extack; int err = 0; fen6_info = container_of(info, struct fib6_entry_notifier_info, info); err = nsim_fib6_event_init(&fib_event->fib6_event, fen6_info); if (err) return err; extack = info->extack; switch (event) { case FIB_EVENT_ENTRY_REPLACE: err = nsim_fib_account(&data->ipv6.fib, true); if (err) { NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported fib entries"); goto err_fib6_event_fini; } break; case FIB_EVENT_ENTRY_DEL: if (data->fail_route_delete) { err = -EINVAL; NL_SET_ERR_MSG_MOD(extack, "Failed to process route deletion"); goto err_fib6_event_fini; } nsim_fib_account(&data->ipv6.fib, false); break; } return 0; err_fib6_event_fini: nsim_fib6_event_fini(&fib_event->fib6_event); return err; } static int nsim_fib_event_schedule_work(struct nsim_fib_data *data, struct fib_notifier_info *info, unsigned long event) { struct nsim_fib_event *fib_event; int err; if (info->family != AF_INET && info->family != AF_INET6) /* netdevsim does not support 'RTNL_FAMILY_IP6MR' and * 'RTNL_FAMILY_IPMR' and should ignore them. */ return NOTIFY_DONE; fib_event = kzalloc(sizeof(*fib_event), GFP_ATOMIC); if (!fib_event) goto err_fib_event_alloc; fib_event->data = data; fib_event->event = event; fib_event->family = info->family; switch (info->family) { case AF_INET: err = nsim_fib4_prepare_event(info, fib_event, event); break; case AF_INET6: err = nsim_fib6_prepare_event(info, fib_event, event); break; } if (err) goto err_fib_prepare_event; /* Enqueue the event and trigger the work */ spin_lock_bh(&data->fib_event_queue_lock); list_add_tail(&fib_event->list, &data->fib_event_queue); spin_unlock_bh(&data->fib_event_queue_lock); schedule_work(&data->fib_event_work); return NOTIFY_DONE; err_fib_prepare_event: kfree(fib_event); err_fib_event_alloc: if (event == FIB_EVENT_ENTRY_DEL) schedule_work(&data->fib_flush_work); return NOTIFY_BAD; } static int nsim_fib_event_nb(struct notifier_block *nb, unsigned long event, void *ptr) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, fib_nb); struct fib_notifier_info *info = ptr; int err; switch (event) { case FIB_EVENT_RULE_ADD: case FIB_EVENT_RULE_DEL: err = nsim_fib_rule_event(data, info, event == FIB_EVENT_RULE_ADD); return notifier_from_errno(err); case FIB_EVENT_ENTRY_REPLACE: case FIB_EVENT_ENTRY_APPEND: case FIB_EVENT_ENTRY_DEL: return nsim_fib_event_schedule_work(data, info, event); } return NOTIFY_DONE; } static void nsim_fib4_rt_free(struct nsim_fib_rt *fib_rt, struct nsim_fib_data *data) { struct devlink *devlink = data->devlink; struct nsim_fib4_rt *fib4_rt; fib4_rt = container_of(fib_rt, struct nsim_fib4_rt, common); nsim_fib4_rt_hw_flags_set(devlink_net(devlink), fib4_rt, false); nsim_fib_account(&data->ipv4.fib, false); nsim_fib4_rt_destroy(fib4_rt); } static void nsim_fib6_rt_free(struct nsim_fib_rt *fib_rt, struct nsim_fib_data *data) { struct nsim_fib6_rt *fib6_rt; fib6_rt = container_of(fib_rt, struct nsim_fib6_rt, common); nsim_fib6_rt_hw_flags_set(data, fib6_rt, false); nsim_fib_account(&data->ipv6.fib, false); nsim_fib6_rt_destroy(fib6_rt); } static void nsim_fib_rt_free(void *ptr, void *arg) { struct nsim_fib_rt *fib_rt = ptr; struct nsim_fib_data *data = arg; switch (fib_rt->key.family) { case AF_INET: nsim_fib4_rt_free(fib_rt, data); break; case AF_INET6: nsim_fib6_rt_free(fib_rt, data); break; default: WARN_ON_ONCE(1); } } /* inconsistent dump, trying again */ static void nsim_fib_dump_inconsistent(struct notifier_block *nb) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, fib_nb); struct nsim_fib_rt *fib_rt, *fib_rt_tmp; /* Flush the work to make sure there is no race with notifications. */ flush_work(&data->fib_event_work); /* The notifier block is still not registered, so we do not need to * take any locks here. */ list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list, list) { rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node, nsim_fib_rt_ht_params); nsim_fib_rt_free(fib_rt, data); } atomic64_set(&data->ipv4.rules.num, 0ULL); atomic64_set(&data->ipv6.rules.num, 0ULL); } static struct nsim_nexthop *nsim_nexthop_create(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop; u64 occ = 0; int i; nexthop = kzalloc(sizeof(*nexthop), GFP_KERNEL); if (!nexthop) return ERR_PTR(-ENOMEM); nexthop->id = info->id; /* Determine the number of nexthop entries the new nexthop will * occupy. */ switch (info->type) { case NH_NOTIFIER_INFO_TYPE_SINGLE: occ = 1; break; case NH_NOTIFIER_INFO_TYPE_GRP: for (i = 0; i < info->nh_grp->num_nh; i++) occ += info->nh_grp->nh_entries[i].weight; break; case NH_NOTIFIER_INFO_TYPE_RES_TABLE: occ = info->nh_res_table->num_nh_buckets; nexthop->is_resilient = true; break; default: NL_SET_ERR_MSG_MOD(info->extack, "Unsupported nexthop type"); kfree(nexthop); return ERR_PTR(-EOPNOTSUPP); } nexthop->occ = occ; return nexthop; } static void nsim_nexthop_destroy(struct nsim_nexthop *nexthop) { kfree(nexthop); } static int nsim_nexthop_account(struct nsim_fib_data *data, u64 occ, bool add, struct netlink_ext_ack *extack) { int i, err = 0; if (add) { for (i = 0; i < occ; i++) if (!atomic64_add_unless(&data->nexthops.num, 1, data->nexthops.max)) { err = -ENOSPC; NL_SET_ERR_MSG_MOD(extack, "Exceeded number of supported nexthops"); goto err_num_decrease; } } else { if (WARN_ON(occ > atomic64_read(&data->nexthops.num))) return -EINVAL; atomic64_sub(occ, &data->nexthops.num); } return err; err_num_decrease: atomic64_sub(i, &data->nexthops.num); return err; } static void nsim_nexthop_hw_flags_set(struct net *net, const struct nsim_nexthop *nexthop, bool trap) { int i; nexthop_set_hw_flags(net, nexthop->id, false, trap); if (!nexthop->is_resilient) return; for (i = 0; i < nexthop->occ; i++) nexthop_bucket_set_hw_flags(net, nexthop->id, i, false, trap); } static int nsim_nexthop_add(struct nsim_fib_data *data, struct nsim_nexthop *nexthop, struct netlink_ext_ack *extack) { struct net *net = devlink_net(data->devlink); int err; err = nsim_nexthop_account(data, nexthop->occ, true, extack); if (err) return err; err = rhashtable_insert_fast(&data->nexthop_ht, &nexthop->ht_node, nsim_nexthop_ht_params); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to insert nexthop"); goto err_nexthop_dismiss; } nsim_nexthop_hw_flags_set(net, nexthop, true); return 0; err_nexthop_dismiss: nsim_nexthop_account(data, nexthop->occ, false, extack); return err; } static int nsim_nexthop_replace(struct nsim_fib_data *data, struct nsim_nexthop *nexthop, struct nsim_nexthop *nexthop_old, struct netlink_ext_ack *extack) { struct net *net = devlink_net(data->devlink); int err; err = nsim_nexthop_account(data, nexthop->occ, true, extack); if (err) return err; err = rhashtable_replace_fast(&data->nexthop_ht, &nexthop_old->ht_node, &nexthop->ht_node, nsim_nexthop_ht_params); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to replace nexthop"); goto err_nexthop_dismiss; } nsim_nexthop_hw_flags_set(net, nexthop, true); nsim_nexthop_account(data, nexthop_old->occ, false, extack); nsim_nexthop_destroy(nexthop_old); return 0; err_nexthop_dismiss: nsim_nexthop_account(data, nexthop->occ, false, extack); return err; } static int nsim_nexthop_insert(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop, *nexthop_old; int err; nexthop = nsim_nexthop_create(data, info); if (IS_ERR(nexthop)) return PTR_ERR(nexthop); nexthop_old = rhashtable_lookup_fast(&data->nexthop_ht, &info->id, nsim_nexthop_ht_params); if (!nexthop_old) err = nsim_nexthop_add(data, nexthop, info->extack); else err = nsim_nexthop_replace(data, nexthop, nexthop_old, info->extack); if (err) nsim_nexthop_destroy(nexthop); return err; } static void nsim_nexthop_remove(struct nsim_fib_data *data, struct nh_notifier_info *info) { struct nsim_nexthop *nexthop; nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &info->id, nsim_nexthop_ht_params); if (!nexthop) return; rhashtable_remove_fast(&data->nexthop_ht, &nexthop->ht_node, nsim_nexthop_ht_params); nsim_nexthop_account(data, nexthop->occ, false, info->extack); nsim_nexthop_destroy(nexthop); } static int nsim_nexthop_res_table_pre_replace(struct nsim_fib_data *data, struct nh_notifier_info *info) { if (data->fail_res_nexthop_group_replace) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace a resilient nexthop group"); return -EINVAL; } return 0; } static int nsim_nexthop_bucket_replace(struct nsim_fib_data *data, struct nh_notifier_info *info) { if (data->fail_nexthop_bucket_replace) { NL_SET_ERR_MSG_MOD(info->extack, "Failed to replace nexthop bucket"); return -EINVAL; } nexthop_bucket_set_hw_flags(info->net, info->id, info->nh_res_bucket->bucket_index, false, true); return 0; } static int nsim_nexthop_event_nb(struct notifier_block *nb, unsigned long event, void *ptr) { struct nsim_fib_data *data = container_of(nb, struct nsim_fib_data, nexthop_nb); struct nh_notifier_info *info = ptr; int err = 0; mutex_lock(&data->nh_lock); switch (event) { case NEXTHOP_EVENT_REPLACE: err = nsim_nexthop_insert(data, info); break; case NEXTHOP_EVENT_DEL: nsim_nexthop_remove(data, info); break; case NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE: err = nsim_nexthop_res_table_pre_replace(data, info); break; case NEXTHOP_EVENT_BUCKET_REPLACE: err = nsim_nexthop_bucket_replace(data, info); break; default: break; } mutex_unlock(&data->nh_lock); return notifier_from_errno(err); } static void nsim_nexthop_free(void *ptr, void *arg) { struct nsim_nexthop *nexthop = ptr; struct nsim_fib_data *data = arg; struct net *net; net = devlink_net(data->devlink); nsim_nexthop_hw_flags_set(net, nexthop, false); nsim_nexthop_account(data, nexthop->occ, false, NULL); nsim_nexthop_destroy(nexthop); } static ssize_t nsim_nexthop_bucket_activity_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) { struct nsim_fib_data *data = file->private_data; struct net *net = devlink_net(data->devlink); struct nsim_nexthop *nexthop; unsigned long *activity; loff_t pos = *ppos; u16 bucket_index; char buf[128]; int err = 0; u32 nhid; if (pos != 0) return -EINVAL; if (size > sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, size)) return -EFAULT; if (sscanf(buf, "%u %hu", &nhid, &bucket_index) != 2) return -EINVAL; rtnl_lock(); nexthop = rhashtable_lookup_fast(&data->nexthop_ht, &nhid, nsim_nexthop_ht_params); if (!nexthop || !nexthop->is_resilient || bucket_index >= nexthop->occ) { err = -EINVAL; goto out; } activity = bitmap_zalloc(nexthop->occ, GFP_KERNEL); if (!activity) { err = -ENOMEM; goto out; } bitmap_set(activity, bucket_index, 1); nexthop_res_grp_activity_update(net, nhid, nexthop->occ, activity); bitmap_free(activity); out: rtnl_unlock(); *ppos = size; return err ?: size; } static const struct file_operations nsim_nexthop_bucket_activity_fops = { .open = simple_open, .write = nsim_nexthop_bucket_activity_write, .llseek = no_llseek, .owner = THIS_MODULE, }; static u64 nsim_fib_ipv4_resource_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB, false); } static u64 nsim_fib_ipv4_rules_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV4_FIB_RULES, false); } static u64 nsim_fib_ipv6_resource_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB, false); } static u64 nsim_fib_ipv6_rules_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_IPV6_FIB_RULES, false); } static u64 nsim_fib_nexthops_res_occ_get(void *priv) { struct nsim_fib_data *data = priv; return nsim_fib_get_val(data, NSIM_RESOURCE_NEXTHOPS, false); } static void nsim_fib_set_max_all(struct nsim_fib_data *data, struct devlink *devlink) { static const enum nsim_resource_id res_ids[] = { NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4_FIB_RULES, NSIM_RESOURCE_IPV6_FIB, NSIM_RESOURCE_IPV6_FIB_RULES, NSIM_RESOURCE_NEXTHOPS, }; int i; for (i = 0; i < ARRAY_SIZE(res_ids); i++) { int err; u64 val; err = devl_resource_size_get(devlink, res_ids[i], &val); if (err) val = (u64) -1; nsim_fib_set_max(data, res_ids[i], val); } } static void nsim_fib_event_work(struct work_struct *work) { struct nsim_fib_data *data = container_of(work, struct nsim_fib_data, fib_event_work); struct nsim_fib_event *fib_event, *next_fib_event; LIST_HEAD(fib_event_queue); spin_lock_bh(&data->fib_event_queue_lock); list_splice_init(&data->fib_event_queue, &fib_event_queue); spin_unlock_bh(&data->fib_event_queue_lock); mutex_lock(&data->fib_lock); list_for_each_entry_safe(fib_event, next_fib_event, &fib_event_queue, list) { nsim_fib_event(fib_event); list_del(&fib_event->list); kfree(fib_event); cond_resched(); } mutex_unlock(&data->fib_lock); } static void nsim_fib_flush_work(struct work_struct *work) { struct nsim_fib_data *data = container_of(work, struct nsim_fib_data, fib_flush_work); struct nsim_fib_rt *fib_rt, *fib_rt_tmp; /* Process pending work. */ flush_work(&data->fib_event_work); mutex_lock(&data->fib_lock); list_for_each_entry_safe(fib_rt, fib_rt_tmp, &data->fib_rt_list, list) { rhashtable_remove_fast(&data->fib_rt_ht, &fib_rt->ht_node, nsim_fib_rt_ht_params); nsim_fib_rt_free(fib_rt, data); } mutex_unlock(&data->fib_lock); } static int nsim_fib_debugfs_init(struct nsim_fib_data *data, struct nsim_dev *nsim_dev) { data->ddir = debugfs_create_dir("fib", nsim_dev->ddir); if (IS_ERR(data->ddir)) return PTR_ERR(data->ddir); data->fail_route_offload = false; debugfs_create_bool("fail_route_offload", 0600, data->ddir, &data->fail_route_offload); data->fail_res_nexthop_group_replace = false; debugfs_create_bool("fail_res_nexthop_group_replace", 0600, data->ddir, &data->fail_res_nexthop_group_replace); data->fail_nexthop_bucket_replace = false; debugfs_create_bool("fail_nexthop_bucket_replace", 0600, data->ddir, &data->fail_nexthop_bucket_replace); debugfs_create_file("nexthop_bucket_activity", 0200, data->ddir, data, &nsim_nexthop_bucket_activity_fops); data->fail_route_delete = false; debugfs_create_bool("fail_route_delete", 0600, data->ddir, &data->fail_route_delete); return 0; } static void nsim_fib_debugfs_exit(struct nsim_fib_data *data) { debugfs_remove_recursive(data->ddir); } struct nsim_fib_data *nsim_fib_create(struct devlink *devlink, struct netlink_ext_ack *extack) { struct nsim_fib_data *data; struct nsim_dev *nsim_dev; int err; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return ERR_PTR(-ENOMEM); data->devlink = devlink; nsim_dev = devlink_priv(devlink); err = nsim_fib_debugfs_init(data, nsim_dev); if (err) goto err_data_free; mutex_init(&data->nh_lock); err = rhashtable_init(&data->nexthop_ht, &nsim_nexthop_ht_params); if (err) goto err_debugfs_exit; mutex_init(&data->fib_lock); INIT_LIST_HEAD(&data->fib_rt_list); err = rhashtable_init(&data->fib_rt_ht, &nsim_fib_rt_ht_params); if (err) goto err_rhashtable_nexthop_destroy; INIT_WORK(&data->fib_event_work, nsim_fib_event_work); INIT_WORK(&data->fib_flush_work, nsim_fib_flush_work); INIT_LIST_HEAD(&data->fib_event_queue); spin_lock_init(&data->fib_event_queue_lock); nsim_fib_set_max_all(data, devlink); data->nexthop_nb.notifier_call = nsim_nexthop_event_nb; err = register_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb, extack); if (err) { pr_err("Failed to register nexthop notifier\n"); goto err_rhashtable_fib_destroy; } data->fib_nb.notifier_call = nsim_fib_event_nb; err = register_fib_notifier(devlink_net(devlink), &data->fib_nb, nsim_fib_dump_inconsistent, extack); if (err) { pr_err("Failed to register fib notifier\n"); goto err_nexthop_nb_unregister; } devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV4_FIB, nsim_fib_ipv4_resource_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV4_FIB_RULES, nsim_fib_ipv4_rules_res_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV6_FIB, nsim_fib_ipv6_resource_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_IPV6_FIB_RULES, nsim_fib_ipv6_rules_res_occ_get, data); devl_resource_occ_get_register(devlink, NSIM_RESOURCE_NEXTHOPS, nsim_fib_nexthops_res_occ_get, data); return data; err_nexthop_nb_unregister: unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb); err_rhashtable_fib_destroy: cancel_work_sync(&data->fib_flush_work); flush_work(&data->fib_event_work); rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, data); err_rhashtable_nexthop_destroy: rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free, data); mutex_destroy(&data->fib_lock); err_debugfs_exit: mutex_destroy(&data->nh_lock); nsim_fib_debugfs_exit(data); err_data_free: kfree(data); return ERR_PTR(err); } void nsim_fib_destroy(struct devlink *devlink, struct nsim_fib_data *data) { devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_NEXTHOPS); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV6_FIB_RULES); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV6_FIB); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB_RULES); devl_resource_occ_get_unregister(devlink, NSIM_RESOURCE_IPV4_FIB); unregister_fib_notifier(devlink_net(devlink), &data->fib_nb); unregister_nexthop_notifier(devlink_net(devlink), &data->nexthop_nb); cancel_work_sync(&data->fib_flush_work); flush_work(&data->fib_event_work); rhashtable_free_and_destroy(&data->fib_rt_ht, nsim_fib_rt_free, data); rhashtable_free_and_destroy(&data->nexthop_ht, nsim_nexthop_free, data); WARN_ON_ONCE(!list_empty(&data->fib_event_queue)); WARN_ON_ONCE(!list_empty(&data->fib_rt_list)); mutex_destroy(&data->fib_lock); mutex_destroy(&data->nh_lock); nsim_fib_debugfs_exit(data); kfree(data); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi.h - ACPI Interface * * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H #include <linux/errno.h> #include <linux/ioport.h> /* for struct resource */ #include <linux/resource_ext.h> #include <linux/device.h> #include <linux/mod_devicetable.h> #include <linux/property.h> #include <linux/uuid.h> struct irq_domain; struct irq_domain_ops; #ifndef _LINUX #define _LINUX #endif #include <acpi/acpi.h> #ifdef CONFIG_ACPI #include <linux/list.h> #include <linux/dynamic_debug.h> #include <linux/module.h> #include <linux/mutex.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> #include <acpi/acpi_numa.h> #include <acpi/acpi_io.h> #include <asm/acpi.h> static inline acpi_handle acpi_device_handle(struct acpi_device *adev) { return adev ? adev->handle : NULL; } #define ACPI_COMPANION(dev) to_acpi_device_node((dev)->fwnode) #define ACPI_COMPANION_SET(dev, adev) set_primary_fwnode(dev, (adev) ? \ acpi_fwnode_handle(adev) : NULL) #define ACPI_HANDLE(dev) acpi_device_handle(ACPI_COMPANION(dev)) #define ACPI_HANDLE_FWNODE(fwnode) \ acpi_device_handle(to_acpi_device_node(fwnode)) static inline struct fwnode_handle *acpi_alloc_fwnode_static(void) { struct fwnode_handle *fwnode; fwnode = kzalloc(sizeof(struct fwnode_handle), GFP_KERNEL); if (!fwnode) return NULL; fwnode_init(fwnode, &acpi_static_fwnode_ops); return fwnode; } static inline void acpi_free_fwnode_static(struct fwnode_handle *fwnode) { if (WARN_ON(!is_acpi_static_node(fwnode))) return; kfree(fwnode); } static inline bool has_acpi_companion(struct device *dev) { return is_acpi_device_node(dev->fwnode); } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { ACPI_COMPANION_SET(dev, acpi_find_child_device(parent, addr, false)); } static inline const char *acpi_dev_name(struct acpi_device *adev) { return dev_name(&adev->dev); } struct device *acpi_get_first_physical_node(struct acpi_device *adev); enum acpi_irq_model_id { ACPI_IRQ_MODEL_PIC = 0, ACPI_IRQ_MODEL_IOAPIC, ACPI_IRQ_MODEL_IOSAPIC, ACPI_IRQ_MODEL_PLATFORM, ACPI_IRQ_MODEL_GIC, ACPI_IRQ_MODEL_LPIC, ACPI_IRQ_MODEL_COUNT }; extern enum acpi_irq_model_id acpi_irq_model; enum acpi_interrupt_id { ACPI_INTERRUPT_PMI = 1, ACPI_INTERRUPT_INIT, ACPI_INTERRUPT_CPEI, ACPI_INTERRUPT_COUNT }; #define ACPI_SPACE_MEM 0 enum acpi_address_range_id { ACPI_ADDRESS_RANGE_MEMORY = 1, ACPI_ADDRESS_RANGE_RESERVED = 2, ACPI_ADDRESS_RANGE_ACPI = 3, ACPI_ADDRESS_RANGE_NVS = 4, ACPI_ADDRESS_RANGE_COUNT }; /* Table Handlers */ union acpi_subtable_headers { struct acpi_subtable_header common; struct acpi_hmat_structure hmat; struct acpi_prmt_module_header prmt; struct acpi_cedt_header cedt; }; typedef int (*acpi_tbl_table_handler)(struct acpi_table_header *table); typedef int (*acpi_tbl_entry_handler)(union acpi_subtable_headers *header, const unsigned long end); typedef int (*acpi_tbl_entry_handler_arg)(union acpi_subtable_headers *header, void *arg, const unsigned long end); /* Debugger support */ struct acpi_debugger_ops { int (*create_thread)(acpi_osd_exec_callback function, void *context); ssize_t (*write_log)(const char *msg); ssize_t (*read_cmd)(char *buffer, size_t length); int (*wait_command_ready)(bool single_step, char *buffer, size_t length); int (*notify_command_complete)(void); }; struct acpi_debugger { const struct acpi_debugger_ops *ops; struct module *owner; struct mutex lock; }; #ifdef CONFIG_ACPI_DEBUGGER int __init acpi_debugger_init(void); int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops); void acpi_unregister_debugger(const struct acpi_debugger_ops *ops); int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context); ssize_t acpi_debugger_write_log(const char *msg); ssize_t acpi_debugger_read_cmd(char *buffer, size_t buffer_length); int acpi_debugger_wait_command_ready(void); int acpi_debugger_notify_command_complete(void); #else static inline int acpi_debugger_init(void) { return -ENODEV; } static inline int acpi_register_debugger(struct module *owner, const struct acpi_debugger_ops *ops) { return -ENODEV; } static inline void acpi_unregister_debugger(const struct acpi_debugger_ops *ops) { } static inline int acpi_debugger_create_thread(acpi_osd_exec_callback function, void *context) { return -ENODEV; } static inline int acpi_debugger_write_log(const char *msg) { return -ENODEV; } static inline int acpi_debugger_read_cmd(char *buffer, u32 buffer_length) { return -ENODEV; } static inline int acpi_debugger_wait_command_ready(void) { return -ENODEV; } static inline int acpi_debugger_notify_command_complete(void) { return -ENODEV; } #endif #define BAD_MADT_ENTRY(entry, end) ( \ (!entry) || (unsigned long)entry + sizeof(*entry) > end || \ ((struct acpi_subtable_header *)entry)->length < sizeof(*entry)) struct acpi_subtable_proc { int id; acpi_tbl_entry_handler handler; acpi_tbl_entry_handler_arg handler_arg; void *arg; int count; }; void __iomem *__acpi_map_table(unsigned long phys, unsigned long size); void __acpi_unmap_table(void __iomem *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); void acpi_boot_table_prepare (void); void acpi_boot_table_init (void); int acpi_mps_check (void); int acpi_numa_init (void); int acpi_locate_initial_tables (void); void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); #ifdef CONFIG_ACPI_TABLE_LIB #define EXPORT_SYMBOL_ACPI_LIB(x) EXPORT_SYMBOL_NS_GPL(x, ACPI) #define __init_or_acpilib #define __initdata_or_acpilib #else #define EXPORT_SYMBOL_ACPI_LIB(x) #define __init_or_acpilib __init #define __initdata_or_acpilib __initdata #endif int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_entries_array(char *id, unsigned long table_size, struct acpi_subtable_proc *proc, int proc_num, unsigned int max_entries); int acpi_table_parse_madt(enum acpi_madt_type id, acpi_tbl_entry_handler handler, unsigned int max_entries); int __init_or_acpilib acpi_table_parse_cedt(enum acpi_cedt_type id, acpi_tbl_entry_handler_arg handler_arg, void *arg); int acpi_parse_mcfg (struct acpi_table_header *header); void acpi_table_print_madt_entry (struct acpi_subtable_header *madt); /* the following numa functions are architecture-dependent */ void acpi_numa_slit_init (struct acpi_table_slit *slit); #if defined(CONFIG_X86) || defined(CONFIG_IA64) || defined(CONFIG_LOONGARCH) void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa); #else static inline void acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { } #endif void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa); #if defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH) void acpi_arch_dma_setup(struct device *dev); #else static inline void acpi_arch_dma_setup(struct device *dev) { } #endif #ifdef CONFIG_ARM64 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa); #else static inline void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { } #endif int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma); #ifndef PHYS_CPUID_INVALID typedef u32 phys_cpuid_t; #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1) #endif static inline bool invalid_logical_cpuid(u32 cpuid) { return (int)cpuid < 0; } static inline bool invalid_phys_cpuid(phys_cpuid_t phys_id) { return phys_id == PHYS_CPUID_INVALID; } /* Validate the processor object's proc_id */ bool acpi_duplicate_processor_id(int proc_id); /* Processor _CTS control */ struct acpi_processor_power; #ifdef CONFIG_ACPI_PROCESSOR_CSTATE bool acpi_processor_claim_cst_control(void); int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info); #else static inline bool acpi_processor_claim_cst_control(void) { return false; } static inline int acpi_processor_evaluate_cst(acpi_handle handle, u32 cpu, struct acpi_processor_power *info) { return -ENODEV; } #endif #ifdef CONFIG_ACPI_HOTPLUG_CPU /* Arch dependent functions for cpu hotplug support */ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu); int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); #endif int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base); int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base); void acpi_irq_stats_init(void); extern u32 acpi_irq_handled; extern u32 acpi_irq_not_handled; extern unsigned int acpi_sci_irq; extern bool acpi_no_s5; #define INVALID_ACPI_IRQ ((unsigned)-1) static inline bool acpi_sci_irq_valid(void) { return acpi_sci_irq != INVALID_ACPI_IRQ; } extern int sbf_port; extern unsigned long acpi_realmode_flags; int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity); int acpi_gsi_to_irq (u32 gsi, unsigned int *irq); int acpi_isa_irq_to_gsi (unsigned isa_irq, u32 *gsi); void acpi_set_irq_model(enum acpi_irq_model_id model, struct fwnode_handle *(*)(u32)); void acpi_set_gsi_to_irq_fallback(u32 (*)(u32)); struct irq_domain *acpi_irq_create_hierarchy(unsigned int flags, unsigned int size, struct fwnode_handle *fwnode, const struct irq_domain_ops *ops, void *host_data); #ifdef CONFIG_X86_IO_APIC extern int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity); #else static inline int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) { return -1; } #endif /* * This function undoes the effect of one call to acpi_register_gsi(). * If this matches the last registration, any IRQ resources for gsi * are freed. */ void acpi_unregister_gsi (u32 gsi); struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); bool acpi_isa_irq_available(int irq); #ifdef CONFIG_PCI void acpi_penalize_sci_irq(int irq, int trigger, int polarity); #else static inline void acpi_penalize_sci_irq(int irq, int trigger, int polarity) { } #endif void acpi_pci_irq_disable (struct pci_dev *dev); extern int ec_read(u8 addr, u8 *val); extern int ec_write(u8 addr, u8 val); extern int ec_transaction(u8 command, const u8 *wdata, unsigned wdata_len, u8 *rdata, unsigned rdata_len); extern acpi_handle ec_get_handle(void); extern bool acpi_is_pnp_device(struct acpi_device *); #if defined(CONFIG_ACPI_WMI) || defined(CONFIG_ACPI_WMI_MODULE) typedef void (*wmi_notify_handler) (u32 value, void *context); int wmi_instance_count(const char *guid); extern acpi_status wmi_evaluate_method(const char *guid, u8 instance, u32 method_id, const struct acpi_buffer *in, struct acpi_buffer *out); extern acpi_status wmi_query_block(const char *guid, u8 instance, struct acpi_buffer *out); extern acpi_status wmi_set_block(const char *guid, u8 instance, const struct acpi_buffer *in); extern acpi_status wmi_install_notify_handler(const char *guid, wmi_notify_handler handler, void *data); extern acpi_status wmi_remove_notify_handler(const char *guid); extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out); extern bool wmi_has_guid(const char *guid); extern char *wmi_get_acpi_device_uid(const char *guid); #endif /* CONFIG_ACPI_WMI */ #define ACPI_VIDEO_OUTPUT_SWITCHING 0x0001 #define ACPI_VIDEO_DEVICE_POSTING 0x0002 #define ACPI_VIDEO_ROM_AVAILABLE 0x0004 #define ACPI_VIDEO_BACKLIGHT 0x0008 #define ACPI_VIDEO_BACKLIGHT_FORCE_VENDOR 0x0010 #define ACPI_VIDEO_BACKLIGHT_FORCE_VIDEO 0x0020 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VENDOR 0x0040 #define ACPI_VIDEO_OUTPUT_SWITCHING_FORCE_VIDEO 0x0080 #define ACPI_VIDEO_BACKLIGHT_DMI_VENDOR 0x0100 #define ACPI_VIDEO_BACKLIGHT_DMI_VIDEO 0x0200 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR 0x0400 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO 0x0800 extern char acpi_video_backlight_string[]; extern long acpi_is_video_device(acpi_handle handle); extern int acpi_blacklisted(void); extern void acpi_osi_setup(char *str); extern bool acpi_osi_is_win8(void); #ifdef CONFIG_ACPI_NUMA int acpi_map_pxm_to_node(int pxm); int acpi_get_node(acpi_handle handle); /** * pxm_to_online_node - Map proximity ID to online node * @pxm: ACPI proximity ID * * This is similar to pxm_to_node(), but always returns an online * node. When the mapped node from a given proximity ID is offline, it * looks up the node distance table and returns the nearest online node. * * ACPI device drivers, which are called after the NUMA initialization has * completed in the kernel, can call this interface to obtain their device * NUMA topology from ACPI tables. Such drivers do not have to deal with * offline nodes. A node may be offline when SRAT memory entry does not exist, * or NUMA is disabled, ex. "numa=off" on x86. */ static inline int pxm_to_online_node(int pxm) { int node = pxm_to_node(pxm); return numa_map_to_online_node(node); } #else static inline int pxm_to_online_node(int pxm) { return 0; } static inline int acpi_map_pxm_to_node(int pxm) { return 0; } static inline int acpi_get_node(acpi_handle handle) { return 0; } #endif extern int pnpacpi_disabled; #define PXM_INVAL (-1) bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, struct resource_win *win); unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable, u8 wake_capable); unsigned int acpi_dev_get_irq_type(int triggering, int polarity); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), void *preproc_data); int acpi_dev_get_dma_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_get_memory_resources(struct acpi_device *adev, struct list_head *list); int acpi_dev_filter_resource_type(struct acpi_resource *ares, unsigned long types); static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares, void *arg) { return acpi_dev_filter_resource_type(ares, (unsigned long)arg); } struct acpi_device *acpi_resource_consumer(struct resource *res); int acpi_check_resource_conflict(const struct resource *res); int acpi_check_region(resource_size_t start, resource_size_t n, const char *name); int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION extern int acpi_check_s4_hw_signature; #endif #ifdef CONFIG_PM_SLEEP void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); void __init acpi_nvs_nosave_s3(void); void __init acpi_sleep_no_blacklist(void); #endif /* CONFIG_PM_SLEEP */ int acpi_register_wakeup_handler( int wake_irq, bool (*wakeup)(void *context), void *context); void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context); struct acpi_osc_context { char *uuid_str; /* UUID string */ int rev; struct acpi_buffer cap; /* list of DWORD capabilities */ struct acpi_buffer ret; /* free by caller if success */ }; acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); /* Number of _OSC capability DWORDS depends on bridge type */ #define OSC_PCI_CAPABILITY_DWORDS 3 #define OSC_CXL_CAPABILITY_DWORDS 5 /* Indexes into _OSC Capabilities Buffer (DWORDs 2 to 5 are device-specific) */ #define OSC_QUERY_DWORD 0 /* DWORD 1 */ #define OSC_SUPPORT_DWORD 1 /* DWORD 2 */ #define OSC_CONTROL_DWORD 2 /* DWORD 3 */ #define OSC_EXT_SUPPORT_DWORD 3 /* DWORD 4 */ #define OSC_EXT_CONTROL_DWORD 4 /* DWORD 5 */ /* _OSC Capabilities DWORD 1: Query/Control and Error Returns (generic) */ #define OSC_QUERY_ENABLE 0x00000001 /* input */ #define OSC_REQUEST_ERROR 0x00000002 /* return */ #define OSC_INVALID_UUID_ERROR 0x00000004 /* return */ #define OSC_INVALID_REVISION_ERROR 0x00000008 /* return */ #define OSC_CAPABILITIES_MASK_ERROR 0x00000010 /* return */ /* Platform-Wide Capabilities _OSC: Capabilities DWORD 2: Support Field */ #define OSC_SB_PAD_SUPPORT 0x00000001 #define OSC_SB_PPC_OST_SUPPORT 0x00000002 #define OSC_SB_PR3_SUPPORT 0x00000004 #define OSC_SB_HOTPLUG_OST_SUPPORT 0x00000008 #define OSC_SB_APEI_SUPPORT 0x00000010 #define OSC_SB_CPC_SUPPORT 0x00000020 #define OSC_SB_CPCV2_SUPPORT 0x00000040 #define OSC_SB_PCLPI_SUPPORT 0x00000080 #define OSC_SB_OSLPI_SUPPORT 0x00000100 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT 0x00001000 #define OSC_SB_GENERIC_INITIATOR_SUPPORT 0x00002000 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE 0x00004000 #define OSC_SB_NATIVE_USB4_SUPPORT 0x00040000 #define OSC_SB_PRM_SUPPORT 0x00200000 #define OSC_SB_FFH_OPR_SUPPORT 0x00400000 extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; extern bool osc_sb_native_usb4_support_confirmed; extern bool osc_sb_cppc2_support_acked; extern bool osc_cpc_flexible_adr_space_confirmed; /* USB4 Capabilities */ #define OSC_USB_USB3_TUNNELING 0x00000001 #define OSC_USB_DP_TUNNELING 0x00000002 #define OSC_USB_PCIE_TUNNELING 0x00000004 #define OSC_USB_XDOMAIN 0x00000008 extern u32 osc_sb_native_usb4_control; /* PCI Host Bridge _OSC: Capabilities DWORD 2: Support Field */ #define OSC_PCI_EXT_CONFIG_SUPPORT 0x00000001 #define OSC_PCI_ASPM_SUPPORT 0x00000002 #define OSC_PCI_CLOCK_PM_SUPPORT 0x00000004 #define OSC_PCI_SEGMENT_GROUPS_SUPPORT 0x00000008 #define OSC_PCI_MSI_SUPPORT 0x00000010 #define OSC_PCI_EDR_SUPPORT 0x00000080 #define OSC_PCI_HPX_TYPE_3_SUPPORT 0x00000100 /* PCI Host Bridge _OSC: Capabilities DWORD 3: Control Field */ #define OSC_PCI_EXPRESS_NATIVE_HP_CONTROL 0x00000001 #define OSC_PCI_SHPC_NATIVE_HP_CONTROL 0x00000002 #define OSC_PCI_EXPRESS_PME_CONTROL 0x00000004 #define OSC_PCI_EXPRESS_AER_CONTROL 0x00000008 #define OSC_PCI_EXPRESS_CAPABILITY_CONTROL 0x00000010 #define OSC_PCI_EXPRESS_LTR_CONTROL 0x00000020 #define OSC_PCI_EXPRESS_DPC_CONTROL 0x00000080 /* CXL _OSC: Capabilities DWORD 4: Support Field */ #define OSC_CXL_1_1_PORT_REG_ACCESS_SUPPORT 0x00000001 #define OSC_CXL_2_0_PORT_DEV_REG_ACCESS_SUPPORT 0x00000002 #define OSC_CXL_PROTOCOL_ERR_REPORTING_SUPPORT 0x00000004 #define OSC_CXL_NATIVE_HP_SUPPORT 0x00000008 /* CXL _OSC: Capabilities DWORD 5: Control Field */ #define OSC_CXL_ERROR_REPORTING_CONTROL 0x00000001 static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_CONTROL_DWORD]; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { u32 *ret = context->ret.pointer; return ret[OSC_EXT_CONTROL_DWORD]; } #define ACPI_GSB_ACCESS_ATTRIB_QUICK 0x00000002 #define ACPI_GSB_ACCESS_ATTRIB_SEND_RCV 0x00000004 #define ACPI_GSB_ACCESS_ATTRIB_BYTE 0x00000006 #define ACPI_GSB_ACCESS_ATTRIB_WORD 0x00000008 #define ACPI_GSB_ACCESS_ATTRIB_BLOCK 0x0000000A #define ACPI_GSB_ACCESS_ATTRIB_MULTIBYTE 0x0000000B #define ACPI_GSB_ACCESS_ATTRIB_WORD_CALL 0x0000000C #define ACPI_GSB_ACCESS_ATTRIB_BLOCK_CALL 0x0000000D #define ACPI_GSB_ACCESS_ATTRIB_RAW_BYTES 0x0000000E #define ACPI_GSB_ACCESS_ATTRIB_RAW_PROCESS 0x0000000F /* Enable _OST when all relevant hotplug operations are enabled */ #if defined(CONFIG_ACPI_HOTPLUG_CPU) && \ defined(CONFIG_ACPI_HOTPLUG_MEMORY) && \ defined(CONFIG_ACPI_CONTAINER) #define ACPI_HOTPLUG_OST #endif /* _OST Source Event Code (OSPM Action) */ #define ACPI_OST_EC_OSPM_SHUTDOWN 0x100 #define ACPI_OST_EC_OSPM_EJECT 0x103 #define ACPI_OST_EC_OSPM_INSERTION 0x200 /* _OST General Processing Status Code */ #define ACPI_OST_SC_SUCCESS 0x0 #define ACPI_OST_SC_NON_SPECIFIC_FAILURE 0x1 #define ACPI_OST_SC_UNRECOGNIZED_NOTIFY 0x2 /* _OST OS Shutdown Processing (0x100) Status Code */ #define ACPI_OST_SC_OS_SHUTDOWN_DENIED 0x80 #define ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS 0x81 #define ACPI_OST_SC_OS_SHUTDOWN_COMPLETED 0x82 #define ACPI_OST_SC_OS_SHUTDOWN_NOT_SUPPORTED 0x83 /* _OST Ejection Request (0x3, 0x103) Status Code */ #define ACPI_OST_SC_EJECT_NOT_SUPPORTED 0x80 #define ACPI_OST_SC_DEVICE_IN_USE 0x81 #define ACPI_OST_SC_DEVICE_BUSY 0x82 #define ACPI_OST_SC_EJECT_DEPENDENCY_BUSY 0x83 #define ACPI_OST_SC_EJECT_IN_PROGRESS 0x84 /* _OST Insertion Request (0x200) Status Code */ #define ACPI_OST_SC_INSERT_IN_PROGRESS 0x80 #define ACPI_OST_SC_DRIVER_LOAD_FAILURE 0x81 #define ACPI_OST_SC_INSERT_NOT_SUPPORTED 0x82 enum acpi_predicate { all_versions, less_than_or_equal, equal, greater_than_or_equal, }; /* Table must be terminted by a NULL entry */ struct acpi_platform_list { char oem_id[ACPI_OEM_ID_SIZE+1]; char oem_table_id[ACPI_OEM_TABLE_ID_SIZE+1]; u32 oem_revision; char *table; enum acpi_predicate pred; char *reason; u32 data; }; int acpi_match_platform_list(const struct acpi_platform_list *plat); extern void acpi_early_init(void); extern void acpi_subsystem_init(void); extern int acpi_nvs_register(__u64 start, __u64 size); extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data); const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev); const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev); const void *acpi_device_get_match_data(const struct device *dev); extern bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv); int acpi_device_uevent_modalias(const struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); struct platform_device *acpi_create_platform_device(struct acpi_device *, const struct property_entry *); #define ACPI_PTR(_ptr) (_ptr) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { adev->flags.visited = true; } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { adev->flags.visited = false; } enum acpi_reconfig_event { ACPI_RECONFIG_DEVICE_ADD = 0, ACPI_RECONFIG_DEVICE_REMOVE, }; int acpi_reconfig_notifier_register(struct notifier_block *nb); int acpi_reconfig_notifier_unregister(struct notifier_block *nb); #ifdef CONFIG_ACPI_GTDT int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER static inline void acpi_arch_set_root_pointer(u64 addr) { } #endif #ifndef ACPI_HAVE_ARCH_GET_ROOT_POINTER static inline u64 acpi_arch_get_root_pointer(void) { return 0; } #endif int acpi_get_local_address(acpi_handle handle, u32 *addr); const char *acpi_get_subsystem_id(acpi_handle handle); #else /* !CONFIG_ACPI */ #define acpi_disabled 1 #define ACPI_COMPANION(dev) (NULL) #define ACPI_COMPANION_SET(dev, adev) do { } while (0) #define ACPI_HANDLE(dev) (NULL) #define ACPI_HANDLE_FWNODE(fwnode) (NULL) #include <acpi/acpi_numa.h> struct fwnode_handle; static inline bool acpi_dev_found(const char *hid) { return false; } static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) { return false; } struct acpi_device; static inline bool acpi_dev_hid_uid_match(struct acpi_device *adev, const char *hid2, const char *uid2) { return false; } static inline int acpi_dev_uid_to_integer(struct acpi_device *adev, u64 *integer) { return -ENODEV; } static inline struct acpi_device * acpi_dev_get_first_match_dev(const char *hid, const char *uid, s64 hrv) { return NULL; } static inline bool acpi_reduced_hardware(void) { return false; } static inline void acpi_dev_put(struct acpi_device *adev) {} static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return false; } static inline bool is_acpi_device_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_device *to_acpi_device_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool is_acpi_data_node(const struct fwnode_handle *fwnode) { return false; } static inline struct acpi_data_node *to_acpi_data_node(const struct fwnode_handle *fwnode) { return NULL; } static inline bool acpi_data_node_match(const struct fwnode_handle *fwnode, const char *name) { return false; } static inline struct fwnode_handle *acpi_fwnode_handle(struct acpi_device *adev) { return NULL; } static inline bool has_acpi_companion(struct device *dev) { return false; } static inline void acpi_preset_companion(struct device *dev, struct acpi_device *parent, u64 addr) { } static inline const char *acpi_dev_name(struct acpi_device *adev) { return NULL; } static inline struct device *acpi_get_first_physical_node(struct acpi_device *adev) { return NULL; } static inline void acpi_early_init(void) { } static inline void acpi_subsystem_init(void) { } static inline int early_acpi_boot_init(void) { return 0; } static inline int acpi_boot_init(void) { return 0; } static inline void acpi_boot_table_prepare(void) { } static inline void acpi_boot_table_init(void) { } static inline int acpi_mps_check(void) { return 0; } static inline int acpi_check_resource_conflict(struct resource *res) { return 0; } static inline int acpi_check_region(resource_size_t start, resource_size_t n, const char *name) { return 0; } struct acpi_table_header; static inline int acpi_table_parse(char *id, int (*handler)(struct acpi_table_header *)) { return -ENODEV; } static inline int acpi_nvs_register(__u64 start, __u64 size) { return 0; } static inline int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), void *data) { return 0; } struct acpi_device_id; static inline const struct acpi_device_id *acpi_match_acpi_device( const struct acpi_device_id *ids, const struct acpi_device *adev) { return NULL; } static inline const struct acpi_device_id *acpi_match_device( const struct acpi_device_id *ids, const struct device *dev) { return NULL; } static inline const void *acpi_device_get_match_data(const struct device *dev) { return NULL; } static inline bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { return false; } static inline bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs) { return false; } static inline union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4) { return NULL; } static inline union acpi_object *acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { return NULL; } static inline int acpi_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env) { return -ENODEV; } static inline int acpi_device_modalias(struct device *dev, char *buf, int size) { return -ENODEV; } static inline struct platform_device * acpi_create_platform_device(struct acpi_device *adev, const struct property_entry *properties) { return NULL; } static inline bool acpi_dma_supported(const struct acpi_device *adev) { return false; } static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) { return DEV_DMA_NOT_SUPPORTED; } static inline int acpi_dma_get_range(struct device *dev, const struct bus_dma_region **map) { return -ENODEV; } static inline int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr) { return 0; } static inline int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr, const u32 *input_id) { return 0; } #define ACPI_PTR(_ptr) (NULL) static inline void acpi_device_set_enumerated(struct acpi_device *adev) { } static inline void acpi_device_clear_enumerated(struct acpi_device *adev) { } static inline int acpi_reconfig_notifier_register(struct notifier_block *nb) { return -EINVAL; } static inline int acpi_reconfig_notifier_unregister(struct notifier_block *nb) { return -EINVAL; } static inline struct acpi_device *acpi_resource_consumer(struct resource *res) { return NULL; } static inline int acpi_get_local_address(acpi_handle handle, u32 *addr) { return -ENODEV; } static inline const char *acpi_get_subsystem_id(acpi_handle handle) { return ERR_PTR(-ENODEV); } static inline int acpi_register_wakeup_handler(int wake_irq, bool (*wakeup)(void *context), void *context) { return -ENXIO; } static inline void acpi_unregister_wakeup_handler( bool (*wakeup)(void *context), void *context) { } struct acpi_osc_context; static inline u32 acpi_osc_ctx_get_pci_control(struct acpi_osc_context *context) { return 0; } static inline u32 acpi_osc_ctx_get_cxl_control(struct acpi_osc_context *context) { return 0; } static inline bool acpi_sleep_state_supported(u8 sleep_state) { return false; } #endif /* !CONFIG_ACPI */ extern void arch_post_acpi_subsys_init(void); #ifdef CONFIG_ACPI_HOTPLUG_IOAPIC int acpi_ioapic_add(acpi_handle root); #else static inline int acpi_ioapic_add(acpi_handle root) { return 0; } #endif #ifdef CONFIG_ACPI void acpi_os_set_prepare_sleep(int (*func)(u8 sleep_state, u32 pm1a_ctrl, u32 pm1b_ctrl)); acpi_status acpi_os_prepare_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); void acpi_os_set_prepare_extended_sleep(int (*func)(u8 sleep_state, u32 val_a, u32 val_b)); acpi_status acpi_os_prepare_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b); #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) struct acpi_s2idle_dev_ops { struct list_head list_node; void (*prepare)(void); void (*check)(void); void (*restore)(void); }; int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); int acpi_get_lps0_constraint(struct acpi_device *adev); #else /* CONFIG_SUSPEND && CONFIG_X86 */ static inline int acpi_get_lps0_constraint(struct device *dev) { return ACPI_STATE_UNKNOWN; } #endif /* CONFIG_SUSPEND && CONFIG_X86 */ #ifndef CONFIG_IA64 void arch_reserve_mem_area(acpi_physical_address addr, size_t size); #else static inline void arch_reserve_mem_area(acpi_physical_address addr, size_t size) { } #endif /* CONFIG_X86 */ #else #define acpi_os_set_prepare_sleep(func, pm1a_ctrl, pm1b_ctrl) do { } while (0) #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM) int acpi_dev_suspend(struct device *dev, bool wakeup); int acpi_dev_resume(struct device *dev); int acpi_subsys_runtime_suspend(struct device *dev); int acpi_subsys_runtime_resume(struct device *dev); int acpi_dev_pm_attach(struct device *dev, bool power_on); bool acpi_storage_d3(struct device *dev); bool acpi_dev_state_d0(struct device *dev); #else static inline int acpi_subsys_runtime_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_runtime_resume(struct device *dev) { return 0; } static inline int acpi_dev_pm_attach(struct device *dev, bool power_on) { return 0; } static inline bool acpi_storage_d3(struct device *dev) { return false; } static inline bool acpi_dev_state_d0(struct device *dev) { return true; } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_PM_SLEEP) int acpi_subsys_prepare(struct device *dev); void acpi_subsys_complete(struct device *dev); int acpi_subsys_suspend_late(struct device *dev); int acpi_subsys_suspend_noirq(struct device *dev); int acpi_subsys_suspend(struct device *dev); int acpi_subsys_freeze(struct device *dev); int acpi_subsys_poweroff(struct device *dev); void acpi_ec_mark_gpe_for_wake(void); void acpi_ec_set_gpe_wake_mask(u8 action); int acpi_subsys_restore_early(struct device *dev); #else static inline int acpi_subsys_prepare(struct device *dev) { return 0; } static inline void acpi_subsys_complete(struct device *dev) {} static inline int acpi_subsys_suspend_late(struct device *dev) { return 0; } static inline int acpi_subsys_suspend_noirq(struct device *dev) { return 0; } static inline int acpi_subsys_suspend(struct device *dev) { return 0; } static inline int acpi_subsys_freeze(struct device *dev) { return 0; } static inline int acpi_subsys_poweroff(struct device *dev) { return 0; } static inline int acpi_subsys_restore_early(struct device *dev) { return 0; } static inline void acpi_ec_mark_gpe_for_wake(void) {} static inline void acpi_ec_set_gpe_wake_mask(u8 action) {} #endif #ifdef CONFIG_ACPI __printf(3, 4) void acpi_handle_printk(const char *level, acpi_handle handle, const char *fmt, ...); void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status); #else /* !CONFIG_ACPI */ static inline __printf(3, 4) void acpi_handle_printk(const char *level, void *handle, const char *fmt, ...) {} static inline void acpi_evaluation_failure_warn(acpi_handle handle, const char *name, acpi_status status) {} #endif /* !CONFIG_ACPI */ #if defined(CONFIG_ACPI) && defined(CONFIG_DYNAMIC_DEBUG) __printf(3, 4) void __acpi_handle_debug(struct _ddebug *descriptor, acpi_handle handle, const char *fmt, ...); #endif /* * acpi_handle_<level>: Print message with ACPI prefix and object path * * These interfaces acquire the global namespace mutex to obtain an object * path. In interrupt context, it shows the object path as <n/a>. */ #define acpi_handle_emerg(handle, fmt, ...) \ acpi_handle_printk(KERN_EMERG, handle, fmt, ##__VA_ARGS__) #define acpi_handle_alert(handle, fmt, ...) \ acpi_handle_printk(KERN_ALERT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_crit(handle, fmt, ...) \ acpi_handle_printk(KERN_CRIT, handle, fmt, ##__VA_ARGS__) #define acpi_handle_err(handle, fmt, ...) \ acpi_handle_printk(KERN_ERR, handle, fmt, ##__VA_ARGS__) #define acpi_handle_warn(handle, fmt, ...) \ acpi_handle_printk(KERN_WARNING, handle, fmt, ##__VA_ARGS__) #define acpi_handle_notice(handle, fmt, ...) \ acpi_handle_printk(KERN_NOTICE, handle, fmt, ##__VA_ARGS__) #define acpi_handle_info(handle, fmt, ...) \ acpi_handle_printk(KERN_INFO, handle, fmt, ##__VA_ARGS__) #if defined(DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__) #else #if defined(CONFIG_DYNAMIC_DEBUG) #define acpi_handle_debug(handle, fmt, ...) \ _dynamic_func_call(fmt, __acpi_handle_debug, \ handle, pr_fmt(fmt), ##__VA_ARGS__) #else #define acpi_handle_debug(handle, fmt, ...) \ ({ \ if (0) \ acpi_handle_printk(KERN_DEBUG, handle, fmt, ##__VA_ARGS__); \ 0; \ }) #endif #endif #if defined(CONFIG_ACPI) && defined(CONFIG_GPIOLIB) bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio); int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index, bool *wake_capable); #else static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { return false; } static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index, bool *wake_capable) { return -ENXIO; } #endif static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index, bool *wake_capable) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable); } static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, name, index, NULL); } static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) { return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, NULL); } /* Device properties */ #ifdef CONFIG_ACPI int acpi_dev_get_property(const struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj); int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args); static inline int acpi_node_get_property_reference( const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return __acpi_node_get_property_reference(fwnode, name, index, NR_FWNODE_REFERENCE_ARGS, args); } static inline bool acpi_dev_has_props(const struct acpi_device *adev) { return !list_empty(&adev->data.properties); } struct acpi_device_properties * acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, union acpi_object *properties); int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); #define ACPI_TABLE_ID_LEN 5 /** * struct acpi_probe_entry - boot-time probing entry * @id: ACPI table name * @type: Optional subtable type to match * (if @id contains subtables) * @subtable_valid: Optional callback to check the validity of * the subtable * @probe_table: Callback to the driver being probed when table * match is successful * @probe_subtbl: Callback to the driver being probed when table and * subtable match (and optional callback is successful) * @driver_data: Sideband data provided back to the driver */ struct acpi_probe_entry { __u8 id[ACPI_TABLE_ID_LEN]; __u8 type; acpi_probe_entry_validate_subtbl subtable_valid; union { acpi_tbl_table_handler probe_table; acpi_tbl_entry_handler probe_subtbl; }; kernel_ulong_t driver_data; }; #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, \ valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_table = fn, \ .driver_data = data, \ } #define ACPI_DECLARE_SUBTABLE_PROBE_ENTRY(table, name, table_id, \ subtable, valid, data, fn) \ static const struct acpi_probe_entry __acpi_probe_##name \ __used __section("__" #table "_acpi_probe_table") = { \ .id = table_id, \ .type = subtable, \ .subtable_valid = valid, \ .probe_subtbl = fn, \ .driver_data = data, \ } #define ACPI_PROBE_TABLE(name) __##name##_acpi_probe_table #define ACPI_PROBE_TABLE_END(name) __##name##_acpi_probe_table_end int __acpi_probe_device_table(struct acpi_probe_entry *start, int nr); #define acpi_probe_device_table(t) \ ({ \ extern struct acpi_probe_entry ACPI_PROBE_TABLE(t), \ ACPI_PROBE_TABLE_END(t); \ __acpi_probe_device_table(&ACPI_PROBE_TABLE(t), \ (&ACPI_PROBE_TABLE_END(t) - \ &ACPI_PROBE_TABLE(t))); \ }) #else static inline int acpi_dev_get_property(struct acpi_device *adev, const char *name, acpi_object_type type, const union acpi_object **obj) { return -ENXIO; } static inline int __acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, size_t num_args, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_get_property_reference(const struct fwnode_handle *fwnode, const char *name, size_t index, struct fwnode_reference_args *args) { return -ENXIO; } static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr) { return -ENXIO; } static inline struct fwnode_handle * acpi_get_next_subnode(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return NULL; } static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return ERR_PTR(-ENXIO); } static inline int acpi_graph_get_remote_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle **remote, struct fwnode_handle **port, struct fwnode_handle **endpoint) { return -ENXIO; } #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ = { (void *) table_id, \ (void *) subtable, \ (void *) valid, \ (void *) fn, \ (void *) data } #define acpi_probe_device_table(t) ({ int __r = 0; __r;}) #endif #ifdef CONFIG_ACPI_TABLE_UPGRADE void acpi_table_upgrade(void); #else static inline void acpi_table_upgrade(void) { } #endif #if defined(CONFIG_ACPI) && defined(CONFIG_ACPI_WATCHDOG) extern bool acpi_has_watchdog(void); #else static inline bool acpi_has_watchdog(void) { return false; } #endif #ifdef CONFIG_ACPI_SPCR_TABLE extern bool qdf2400_e44_present; int acpi_parse_spcr(bool enable_earlycon, bool enable_console); #else static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) { return 0; } #endif #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } #endif #ifdef CONFIG_ACPI_LPIT int lpit_read_residency_count_address(u64 *address); #else static inline int lpit_read_residency_count_address(u64 *address) { return -EINVAL; } #endif #ifdef CONFIG_ACPI_PPTT int acpi_pptt_cpu_is_thread(unsigned int cpu); int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology(unsigned int cpu, int level) { return -EINVAL; } static inline int find_acpi_cpu_topology_cluster(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_package(unsigned int cpu) { return -EINVAL; } static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } #endif #ifdef CONFIG_ARM64 void acpi_arm_init(void); #else static inline void acpi_arm_init(void) { } #endif #ifdef CONFIG_ACPI_PCC void acpi_init_pcc(void); #else static inline void acpi_init_pcc(void) { } #endif #ifdef CONFIG_ACPI_FFH void acpi_init_ffh(void); extern int acpi_ffh_address_space_arch_setup(void *handler_ctxt, void **region_ctxt); extern int acpi_ffh_address_space_arch_handler(acpi_integer *value, void *region_context); #else static inline void acpi_init_ffh(void) { } #endif #ifdef CONFIG_ACPI extern void acpi_device_notify(struct device *dev); extern void acpi_device_notify_remove(struct device *dev); #else static inline void acpi_device_notify(struct device *dev) { } static inline void acpi_device_notify_remove(struct device *dev) { } #endif #endif /*_LINUX_ACPI_H*/
16 2684 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IVERSION_H #define _LINUX_IVERSION_H #include <linux/fs.h> /* * The inode->i_version field: * --------------------------- * The change attribute (i_version) is mandated by NFSv4 and is mostly for * knfsd, but is also used for other purposes (e.g. IMA). The i_version must * appear larger to observers if there was an explicit change to the inode's * data or metadata since it was last queried. * * An explicit change is one that would ordinarily result in a change to the * inode status change time (aka ctime). i_version must appear to change, even * if the ctime does not (since the whole point is to avoid missing updates due * to timestamp granularity). If POSIX or other relevant spec mandates that the * ctime must change due to an operation, then the i_version counter must be * incremented as well. * * Making the i_version update completely atomic with the operation itself would * be prohibitively expensive. Traditionally the kernel has updated the times on * directories after an operation that changes its contents. For regular files, * the ctime is usually updated before the data is copied into the cache for a * write. This means that there is a window of time when an observer can * associate a new timestamp with old file contents. Since the purpose of the * i_version is to allow for better cache coherency, the i_version must always * be updated after the results of the operation are visible. Updating it before * and after a change is also permitted. (Note that no filesystems currently do * this. Fixing that is a work-in-progress). * * Observers see the i_version as a 64-bit number that never decreases. If it * remains the same since it was last checked, then nothing has changed in the * inode. If it's different then something has changed. Observers cannot infer * anything about the nature or magnitude of the changes from the value, only * that the inode has changed in some fashion. * * Not all filesystems properly implement the i_version counter. Subsystems that * want to use i_version field on an inode should first check whether the * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro). * * Those that set SB_I_VERSION will automatically have their i_version counter * incremented on writes to normal files. If the SB_I_VERSION is not set, then * the VFS will not touch it on writes, and the filesystem can use it how it * wishes. Note that the filesystem is always responsible for updating the * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.). * We consider these sorts of filesystems to have a kernel-managed i_version. * * It may be impractical for filesystems to keep i_version updates atomic with * respect to the changes that cause them. They should, however, guarantee * that i_version updates are never visible before the changes that caused * them. Also, i_version updates should never be delayed longer than it takes * the original change to reach disk. * * This implementation uses the low bit in the i_version field as a flag to * track when the value has been queried. If it has not been queried since it * was last incremented, we can skip the increment in most cases. * * In the event that we're updating the ctime, we will usually go ahead and * bump the i_version anyway. Since that has to go to stable storage in some * fashion, we might as well increment it as well. * * With this implementation, the value should always appear to observers to * increase over time if the file has changed. It's recommended to use * inode_eq_iversion() helper to compare values. * * Note that some filesystems (e.g. NFS and AFS) just use the field to store * a server-provided value (for the most part). For that reason, those * filesystems do not set SB_I_VERSION. These filesystems are considered to * have a self-managed i_version. * * Persistently storing the i_version * ---------------------------------- * Queries of the i_version field are not gated on them hitting the backing * store. It's always possible that the host could crash after allowing * a query of the value but before it has made it to disk. * * To mitigate this problem, filesystems should always use * inode_set_iversion_queried when loading an existing inode from disk. This * ensures that the next attempted inode increment will result in the value * changing. * * Storing the value to disk therefore does not count as a query, so those * filesystems should use inode_peek_iversion to grab the value to be stored. * There is no need to flag the value as having been queried in that case. */ /* * We borrow the lowest bit in the i_version to use as a flag to tell whether * it has been queried since we last incremented it. If it has, then we must * increment it on the next change. After that, we can clear the flag and * avoid incrementing it again until it has again been queried. */ #define I_VERSION_QUERIED_SHIFT (1) #define I_VERSION_QUERIED (1ULL << (I_VERSION_QUERIED_SHIFT - 1)) #define I_VERSION_INCREMENT (1ULL << I_VERSION_QUERIED_SHIFT) /** * inode_set_iversion_raw - set i_version to the specified raw value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for use by * filesystems that self-manage the i_version. * * For example, the NFS client stores its NFSv4 change attribute in this way, * and the AFS client stores the data_version from the server here. */ static inline void inode_set_iversion_raw(struct inode *inode, u64 val) { atomic64_set(&inode->i_version, val); } /** * inode_peek_iversion_raw - grab a "raw" iversion value * @inode: inode from which i_version should be read * * Grab a "raw" inode->i_version value and return it. The i_version is not * flagged or converted in any way. This is mostly used to access a self-managed * i_version. * * With those filesystems, we want to treat the i_version as an entirely * opaque value. */ static inline u64 inode_peek_iversion_raw(const struct inode *inode) { return atomic64_read(&inode->i_version); } /** * inode_set_max_iversion_raw - update i_version new value is larger * @inode: inode to set * @val: new i_version to set * * Some self-managed filesystems (e.g Ceph) will only update the i_version * value if the new value is larger than the one we already have. */ static inline void inode_set_max_iversion_raw(struct inode *inode, u64 val) { u64 cur = inode_peek_iversion_raw(inode); do { if (cur > val) break; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, val)); } /** * inode_set_iversion - set i_version to a particular value * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val. This function is for filesystems with * a kernel-managed i_version, for initializing a newly-created inode from * scratch. * * In this case, we do not set the QUERIED flag since we know that this value * has never been queried. */ static inline void inode_set_iversion(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, val << I_VERSION_QUERIED_SHIFT); } /** * inode_set_iversion_queried - set i_version to a particular value as quereied * @inode: inode to set * @val: new i_version value to set * * Set @inode's i_version field to @val, and flag it for increment on the next * change. * * Filesystems that persistently store the i_version on disk should use this * when loading an existing inode from disk. * * When loading in an i_version value from a backing store, we can't be certain * that it wasn't previously viewed before being stored. Thus, we must assume * that it was, to ensure that we don't end up handing out the same value for * different versions of the same inode. */ static inline void inode_set_iversion_queried(struct inode *inode, u64 val) { inode_set_iversion_raw(inode, (val << I_VERSION_QUERIED_SHIFT) | I_VERSION_QUERIED); } bool inode_maybe_inc_iversion(struct inode *inode, bool force); /** * inode_inc_iversion - forcibly increment i_version * @inode: inode that needs to be updated * * Forcbily increment the i_version field. This always results in a change to * the observable value. */ static inline void inode_inc_iversion(struct inode *inode) { inode_maybe_inc_iversion(inode, true); } /** * inode_iversion_need_inc - is the i_version in need of being incremented? * @inode: inode to check * * Returns whether the inode->i_version counter needs incrementing on the next * change. Just fetch the value and check the QUERIED flag. */ static inline bool inode_iversion_need_inc(struct inode *inode) { return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED; } /** * inode_inc_iversion_raw - forcibly increment raw i_version * @inode: inode that needs to be updated * * Forcbily increment the raw i_version field. This always results in a change * to the raw value. * * NFS will use the i_version field to store the value from the server. It * mostly treats it as opaque, but in the case where it holds a write * delegation, it must increment the value itself. This function does that. */ static inline void inode_inc_iversion_raw(struct inode *inode) { atomic64_inc(&inode->i_version); } /** * inode_peek_iversion - read i_version without flagging it to be incremented * @inode: inode from which i_version should be read * * Read the inode i_version counter for an inode without registering it as a * query. * * This is typically used by local filesystems that need to store an i_version * on disk. In that situation, it's not necessary to flag it as having been * viewed, as the result won't be used to gauge changes from that point. */ static inline u64 inode_peek_iversion(const struct inode *inode) { return inode_peek_iversion_raw(inode) >> I_VERSION_QUERIED_SHIFT; } /* * For filesystems without any sort of change attribute, the best we can * do is fake one up from the ctime: */ static inline u64 time_to_chattr(struct timespec64 *t) { u64 chattr = t->tv_sec; chattr <<= 32; chattr += t->tv_nsec; return chattr; } u64 inode_query_iversion(struct inode *inode); /** * inode_eq_iversion_raw - check whether the raw i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare the current raw i_version counter with a previous one. Returns true * if they are the same or false if they are different. */ static inline bool inode_eq_iversion_raw(const struct inode *inode, u64 old) { return inode_peek_iversion_raw(inode) == old; } /** * inode_eq_iversion - check whether the i_version counter has changed * @inode: inode to check * @old: old value to check against its i_version * * Compare an i_version counter with a previous one. Returns true if they are * the same, and false if they are different. * * Note that we don't need to set the QUERIED flag in this case, as the value * in the inode is not being recorded for later use. */ static inline bool inode_eq_iversion(const struct inode *inode, u64 old) { return inode_peek_iversion(inode) == old; } #endif
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Generic internet FLOW. * */ #ifndef _NET_FLOW_H #define _NET_FLOW_H #include <linux/in6.h> #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/uidgid.h> struct flow_keys; /* * ifindex generation is per-net namespace, and loopback is * always the 1st device in ns (see net_dev_init), thus any * loopback device should get ifindex 1 */ #define LOOPBACK_IFINDEX 1 struct flowi_tunnel { __be64 tun_id; }; struct flowi_common { int flowic_oif; int flowic_iif; int flowic_l3mdev; __u32 flowic_mark; __u8 flowic_tos; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 __u32 flowic_secid; kuid_t flowic_uid; struct flowi_tunnel flowic_tun_key; __u32 flowic_multipath_hash; }; union flowi_uli { struct { __be16 dport; __be16 sport; } ports; struct { __u8 type; __u8 code; } icmpt; __be32 gre_key; struct { __u8 type; } mht; }; struct flowi4 { struct flowi_common __fl_common; #define flowi4_oif __fl_common.flowic_oif #define flowi4_iif __fl_common.flowic_iif #define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark #define flowi4_tos __fl_common.flowic_tos #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid #define flowi4_multipath_hash __fl_common.flowic_multipath_hash /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; __be32 daddr; union flowi_uli uli; #define fl4_sport uli.ports.sport #define fl4_dport uli.ports.dport #define fl4_icmp_type uli.icmpt.type #define fl4_icmp_code uli.icmpt.code #define fl4_mh_type uli.mht.type #define fl4_gre_key uli.gre_key } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, kuid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; fl4->flowi4_tos = tos; fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = sport; fl4->flowi4_multipath_hash = 0; } /* Reset some input parameters after previous lookup */ static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __be32 daddr, __be32 saddr) { fl4->flowi4_oif = oif; fl4->daddr = daddr; fl4->saddr = saddr; } struct flowi6 { struct flowi_common __fl_common; #define flowi6_oif __fl_common.flowic_oif #define flowi6_iif __fl_common.flowic_iif #define flowi6_l3mdev __fl_common.flowic_l3mdev #define flowi6_mark __fl_common.flowic_mark #define flowi6_scope __fl_common.flowic_scope #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport #define fl6_dport uli.ports.dport #define fl6_icmp_type uli.icmpt.type #define fl6_icmp_code uli.icmpt.code #define fl6_mh_type uli.mht.type #define fl6_gre_key uli.gre_key __u32 mp_hash; } __attribute__((__aligned__(BITS_PER_LONG/8))); struct flowi { union { struct flowi_common __fl_common; struct flowi4 ip4; struct flowi6 ip6; } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark #define flowi_tos u.__fl_common.flowic_tos #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) { return container_of(fl4, struct flowi, u.ip4); } static inline struct flowi_common *flowi4_to_flowi_common(struct flowi4 *fl4) { return &(fl4->__fl_common); } static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) { return container_of(fl6, struct flowi, u.ip6); } static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6) { return &(fl6->__fl_common); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); #endif
1438 1293 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_INTERNAL_H #define _LINUX_HIGHMEM_INTERNAL_H /* * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); static inline void kmap_assert_nomap(void) { DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); } #else static inline void kmap_local_fork(struct task_struct *tsk) { } static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM #include <asm/highmem.h> #ifndef ARCH_HAS_KMAP_FLUSH_TLB static inline void kmap_flush_tlb(unsigned long addr) { } #endif #ifndef kmap_prot #define kmap_prot PAGE_KERNEL #endif void *kmap_high(struct page *page); void kunmap_high(struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); static inline void *kmap(struct page *page) { void *addr; might_sleep(); if (!PageHighMem(page)) addr = page_address(page); else addr = kmap_high(page); kmap_flush_tlb((unsigned long)addr); return addr; } static inline void kunmap(struct page *page) { might_sleep(); if (!PageHighMem(page)) return; kunmap_high(page); } static inline struct page *kmap_to_page(void *addr) { return __kmap_to_page(addr); } static inline void kmap_flush_unused(void) { __kmap_flush_unused(); } static inline void *kmap_local_page(struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } static inline void *kmap_local_pfn(unsigned long pfn) { return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_local(const void *vaddr) { kunmap_local_indexed(vaddr); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_page_prot(page, prot); } static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); } static inline void *kmap_atomic_pfn(unsigned long pfn) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return __kmap_local_pfn_prot(pfn, kmap_prot); } static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } unsigned int __nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; static inline unsigned int nr_free_highpages(void) { return __nr_free_highpages(); } static inline unsigned long totalhigh_pages(void) { return (unsigned long)atomic_long_read(&_totalhigh_pages); } static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); } static inline bool is_kmap_addr(const void *x) { unsigned long addr = (unsigned long)x; return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) || (addr >= __fix_to_virt(FIX_KMAP_END) && addr < __fix_to_virt(FIX_KMAP_BEGIN)); } #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) { return virt_to_page(addr); } static inline void *kmap(struct page *page) { might_sleep(); return page_address(page); } static inline void kunmap_high(struct page *page) { } static inline void kmap_flush_unused(void) { } static inline void kunmap(struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } static inline void *kmap_local_page(struct page *page) { return page_address(page); } static inline void *kmap_local_folio(struct folio *folio, size_t offset) { return page_address(&folio->page) + offset; } static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) { return kmap_local_page(page); } static inline void *kmap_local_pfn(unsigned long pfn) { return kmap_local_page(pfn_to_page(pfn)); } static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif } static inline void *kmap_atomic(struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); else preempt_disable(); pagefault_disable(); return page_address(page); } static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { return kmap_atomic(page); } static inline void *kmap_atomic_pfn(unsigned long pfn) { return kmap_atomic(pfn_to_page(pfn)); } static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE)); #endif pagefault_enable(); if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_enable(); else preempt_enable(); } static inline unsigned int nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0UL; } static inline bool is_kmap_addr(const void *x) { return false; } #endif /* CONFIG_HIGHMEM */ /** * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated! * @__addr: Virtual address to be unmapped * * Unmaps an address previously mapped by kmap_atomic() and re-enables * pagefaults. Depending on PREEMP_RT configuration, re-enables also * migration and preemption. Users should not count on these side effects. * * Mappings should be unmapped in the reverse order that they were mapped. * See kmap_local_page() for details on nesting. * * @__addr can be any address within the mapped page, so there is no need * to subtract any offset that has been added. In contrast to kunmap(), * this function takes the address returned from kmap_atomic(), not the * page passed to it. The compiler will warn you if you pass the page. */ #define kunmap_atomic(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_atomic(__addr); \ } while (0) /** * kunmap_local - Unmap a page mapped via kmap_local_page(). * @__addr: An address within the page mapped * * @__addr can be any address within the mapped page. Commonly it is the * address return from kmap_local_page(), but it can also include offsets. * * Unmapping should be done in the reverse order of the mapping. See * kmap_local_page() for details. */ #define kunmap_local(__addr) \ do { \ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ __kunmap_local(__addr); \ } while (0) #endif
1196 1196 1178 1197 1195 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/symlink.c - kernfs symlink implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/gfp.h> #include <linux/namei.h> #include "kernfs-internal.h" /** * kernfs_create_link - create a symlink * @parent: directory to create the symlink in * @name: name of the symlink * @target: target node for the symlink to point to * * Return: the created node on success, ERR_PTR() value on error. * Ownership of the link matches ownership of the target. */ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { struct kernfs_node *kn; int error; kuid_t uid = GLOBAL_ROOT_UID; kgid_t gid = GLOBAL_ROOT_GID; if (target->iattr) { uid = target->iattr->ia_uid; gid = target->iattr->ia_gid; } kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); if (kernfs_ns_enabled(parent)) kn->ns = target->ns; kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ error = kernfs_add_one(kn); if (!error) return kn; kernfs_put(kn); return ERR_PTR(error); } static int kernfs_get_target_path(struct kernfs_node *parent, struct kernfs_node *target, char *path) { struct kernfs_node *base, *kn; char *s = path; int len = 0; /* go up to the root, stop at the base */ base = parent; while (base->parent) { kn = target->parent; while (kn->parent && base != kn) kn = kn->parent; if (base == kn) break; if ((s - path) + 3 >= PATH_MAX) return -ENAMETOOLONG; strcpy(s, "../"); s += 3; base = base->parent; } /* determine end of target string for reverse fillup */ kn = target; while (kn->parent && kn != base) { len += strlen(kn->name) + 1; kn = kn->parent; } /* check limits */ if (len < 2) return -EINVAL; len--; if ((s - path) + len >= PATH_MAX) return -ENAMETOOLONG; /* reverse fillup of target string from target to base */ kn = target; while (kn->parent && kn != base) { int slen = strlen(kn->name); len -= slen; memcpy(s + len, kn->name, slen); if (len) s[--len] = '/'; kn = kn->parent; } return 0; } static int kernfs_getlink(struct inode *inode, char *path) { struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; struct kernfs_root *root = kernfs_root(parent); int error; down_read(&root->kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); up_read(&root->kernfs_rwsem); return error; } static const char *kernfs_iop_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *body; int error; if (!dentry) return ERR_PTR(-ECHILD); body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); } set_delayed_call(done, kfree_link, body); return body; } const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. * * Also note that we take the address to load from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) goto out; /* * may_open() has already checked for this, so it should be * impossible to trip now. But we need to be extra cautious * and check again at the very end too. */ error = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || path_noexec(&file->f_path))) goto exit; error = -ENOEXEC; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!fmt->load_shlib) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); error = fmt->load_shlib(file); read_lock(&binfmt_lock); put_binfmt(fmt); if (error != -ENOEXEC) break; } read_unlock(&binfmt_lock); exit: fput(file); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * by hand ahead of time. */ if (write && pos < vma->vm_start) { mmap_write_lock(mm); ret = expand_downwards(vma, pos); if (unlikely(ret < 0)) { mmap_write_unlock(mm); return NULL; } mmap_write_downgrade(mm); } else mmap_read_lock(mm); /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; vma_set_anonymous(vma); if (mmap_write_lock_killable(mm)) { err = -EINTR; goto err_free; } /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; err: mmap_write_unlock(mm); err_free: bprm->vma = NULL; vm_area_free(vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; bprm->argmin = bprm->p - limit; return 0; } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; #ifdef CONFIG_MMU if (bprm->p < bprm->argmin) goto out; #endif while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (IS_ENABLED(CONFIG_MMU) && bprm->p < bprm->argmin) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. The process proceeds as follows: * * 1) Use shift to calculate the new vma endpoints. * 2) Extend vma to cover both the old and new ranges. This ensures the * arguments passed to subsequent functions are consistent. * 3) Move vma's page tables to the new range. * 4) Free up any cleared pgd range. * 5) Shrink the vma to cover only the new range. */ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) { struct mm_struct *mm = vma->vm_mm; unsigned long old_start = vma->vm_start; unsigned long old_end = vma->vm_end; unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; VMA_ITERATOR(vmi, mm, new_start); struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); /* * ensure there are no vmas between where we want to go * and where we are */ if (vma != vma_next(&vmi)) return -EFAULT; vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ if (vma_expand(&vmi, vma, new_start, old_end, vma->vm_pgoff, NULL)) return -ENOMEM; /* * move the page tables downwards, on failure we rely on * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, vma, new_start, length, false)) return -ENOMEM; lru_add_drain(); tlb_gather_mmu(&tlb, mm); next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch * the address space in [new_end, old_start) some architectures * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); vma_prev(&vmi); /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { ret = shift_arg_pages(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) goto out; /* * may_open() has already checked for this, so it should be * impossible to trip now. But we need to be extra cautious * and check again at the very end too. */ err = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || path_noexec(&file->f_path))) goto exit; err = deny_write_access(file); if (err) goto exit; out: return file; exit: fput(file); return ERR_PTR(err); } struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); if (old_mm) sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); /* Always NUL terminated and zero-padded */ strscpy_pad(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable * so that a new one can be started */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); __set_task_comm(me, kbasename(bprm->filename), true); /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } if (bprm->file) { allow_write_access(bprm->file); fput(bprm->file); } if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename) { struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); int retval = -ENOMEM; if (!bprm) goto out; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!bprm->fdpath) goto out_free; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) goto out_free; return bprm; out_free: free_bprm(bprm); out: return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... */ t = p; n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); while_each_thread(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { int ret = 0; unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) { ret = -EFAULT; goto out; } kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; ret = 0; out: return ret; } EXPORT_SYMBOL(remove_arg_zero); #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; retval = -ENOENT; retry: read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); if (need_retry) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) return retval; need_retry = false; goto retry; } return retval; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } /* * sys_execve() executes a new program. */ static int bprm_execve(struct linux_binprm *bprm, int fd, struct filename *filename, int flags) { struct file *file; int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; sched_exec(); bprm->file = file; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (bprm->fdpath && get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); out_unmark: sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval == 0) pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; } retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm, fd, filename, 0); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) validate_coredump_safety(); return error; } static struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { } }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */
53 2408 2408 94 185 5824 440 2 18 1286 18 1286 1298 1224 1187 1130 159 991 402 167 1 1662 138 639 26 1587 148 3016 3002 573 1589 1426 529 2801 1306 2 659 571 304 1188 1270 147 431 28 31 36 21 21 1 1416 1400 5 1 1 2673 875 2672 875 1 2402 4557 13 167 221 248 1250 298 1111 68 1113 1106 3429 1263 1787 2207 2207 5488 5487 1270 59 59 34 68 68 34 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ext4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EXT4_H #include <linux/writeback.h> #include <linux/tracepoint.h> struct ext4_allocation_context; struct ext4_allocation_request; struct ext4_extent; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ { EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \ { EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \ { EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \ { EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \ { EXT4_MB_HINT_BEST, "HINT_BEST" }, \ { EXT4_MB_HINT_DATA, "HINT_DATA" }, \ { EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \ { EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \ { EXT4_MB_HINT_GOAL_ONLY, "HINT_GOAL_ONLY" }, \ { EXT4_MB_HINT_TRY_GOAL, "HINT_TRY_GOAL" }, \ { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ { EXT4_MB_USE_RESERVED, "USE_RESV" }, \ { EXT4_MB_STRICT_CHECK, "STRICT_CHECK" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \ { EXT4_GET_BLOCKS_ZERO, "ZERO" }, \ { EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \ { EXT4_EX_NOCACHE, "EX_NOCACHE" }) /* * __print_flags() requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace * ring buffer. */ TRACE_DEFINE_ENUM(BH_New); TRACE_DEFINE_ENUM(BH_Mapped); TRACE_DEFINE_ENUM(BH_Unwritten); TRACE_DEFINE_ENUM(BH_Boundary); #define show_mflags(flags) __print_flags(flags, "", \ { EXT4_MAP_NEW, "N" }, \ { EXT4_MAP_MAPPED, "M" }, \ { EXT4_MAP_UNWRITTEN, "U" }, \ { EXT4_MAP_BOUNDARY, "B" }) #define show_free_flags(flags) __print_flags(flags, "|", \ { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \ { EXT4_FREE_BLOCKS_FORGET, "FORGET" }, \ { EXT4_FREE_BLOCKS_VALIDATED, "VALIDATED" }, \ { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE, "NO_QUOTA" }, \ { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\ { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER, "LAST_CLUSTER" }) TRACE_DEFINE_ENUM(ES_WRITTEN_B); TRACE_DEFINE_ENUM(ES_UNWRITTEN_B); TRACE_DEFINE_ENUM(ES_DELAYED_B); TRACE_DEFINE_ENUM(ES_HOLE_B); TRACE_DEFINE_ENUM(ES_REFERENCED_B); #define show_extent_status(status) __print_flags(status, "", \ { EXTENT_STATUS_WRITTEN, "W" }, \ { EXTENT_STATUS_UNWRITTEN, "U" }, \ { EXTENT_STATUS_DELAYED, "D" }, \ { EXTENT_STATUS_HOLE, "H" }, \ { EXTENT_STATUS_REFERENCED, "R" }) #define show_falloc_mode(mode) __print_flags(mode, "|", \ { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \ { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM); TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT); TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR); TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE); TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA); TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME); TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); #define show_fc_reason(reason) \ __print_symbolic(reason, \ { EXT4_FC_REASON_XATTR, "XATTR"}, \ { EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \ { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \ { EXT4_FC_REASON_NOMEM, "NO_MEM"}, \ { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN); TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW); TRACE_DEFINE_ENUM(CR_ANY_FREE); #define show_criteria(cr) \ __print_symbolic(cr, \ { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" }, \ { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" }, \ { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" }, \ { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" }, \ { CR_ANY_FREE, "CR_ANY_FREE" }) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), TP_ARGS(inode, orig_ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, orig_ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u16, mode ) ), TP_fast_assign( __entry->orig_ino = orig_ino; __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->orig_ino, (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid) ); TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u64, blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->blocks = inode->i_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid, __entry->blocks) ); TRACE_EVENT(ext4_request_inode, TP_PROTO(struct inode *dir, int mode), TP_ARGS(dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, TP_PROTO(struct inode *inode, struct inode *dir, int mode), TP_ARGS(inode, dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, nlink ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nlink = inode->i_nlink; ), TP_printk("dev %d,%d ino %lu nlink %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nlink) ); TRACE_EVENT(ext4_drop_inode, TP_PROTO(struct inode *inode, int drop), TP_ARGS(inode, drop), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, drop ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->drop = drop; ), TP_printk("dev %d,%d ino %lu drop %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->drop) ); TRACE_EVENT(ext4_nfs_commit_metadata, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mark_inode_dirty, TP_PROTO(struct inode *inode, unsigned long IP), TP_ARGS(inode, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, ip ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ip = IP; ), TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_begin_ordered_truncate, TP_PROTO(struct inode *inode, loff_t new_size), TP_ARGS(inode, new_size), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, new_size ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->new_size = new_size; ), TP_printk("dev %d,%d ino %lu new_size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->new_size) ); DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; ), TP_printk("dev %d,%d ino %lu pos %lld len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len) ); DEFINE_EVENT(ext4__write_begin, ext4_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len), TP_ARGS(inode, pos, len) ); DECLARE_EVENT_CLASS(ext4__write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) __field( unsigned int, copied ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_da_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); TRACE_EVENT(ext4_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( long, nr_to_write ) __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) __field( char, for_kupdate ) __field( char, range_cyclic ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->range_cyclic = wbc->range_cyclic; ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " "range_start %lld range_end %lld sync_mode %d " "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_pages, TP_PROTO(struct inode *inode, pgoff_t first_page, struct writeback_control *wbc), TP_ARGS(inode, first_page, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, first_page ) __field( long, nr_to_write ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->first_page = first_page; __entry->nr_to_write = wbc->nr_to_write; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " "sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->first_page, __entry->nr_to_write, __entry->sync_mode) ); TRACE_EVENT(ext4_da_write_pages_extent, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), TP_ARGS(inode, map), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, lblk ) __field( __u32, len ) __field( __u32, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->flags = map->m_flags; ), TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_mflags(__entry->flags)) ); TRACE_EVENT(ext4_writepages_result, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int ret, int pages_written), TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) __field( long, pages_skipped ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->sync_mode, (unsigned long) __entry->writeback_index) ); DECLARE_EVENT_CLASS(ext4__folio_op, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->index = folio->index; ), TP_printk("dev %d,%d ino %lu folio_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index) ); DEFINE_EVENT(ext4__folio_op, ext4_read_folio, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio) ); DEFINE_EVENT(ext4__folio_op, ext4_release_folio, TP_PROTO(struct inode *inode, struct folio *folio), TP_ARGS(inode, folio) ); DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) __field( size_t, offset ) __field( size_t, length ) ), TP_fast_assign( __entry->dev = folio->mapping->host->i_sb->s_dev; __entry->ino = folio->mapping->host->i_ino; __entry->index = folio->index; __entry->offset = offset; __entry->length = length; ), TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index, __entry->offset, __entry->length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio, TP_PROTO(struct folio *folio, size_t offset, size_t length), TP_ARGS(folio, offset, length) ); TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), TP_ARGS(sb, blk, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->blk = blk; __entry->count = count; ), TP_printk("dev %d,%d blk %llu count %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blk, __entry->count) ); DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u64, pa_lstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = ac->ac_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_lstart = pa->pa_lstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); TRACE_EVENT(ext4_mb_release_inode_pa, TP_PROTO(struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), TP_ARGS(pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( __u32, count ) ), TP_fast_assign( __entry->dev = pa->pa_inode->i_sb->s_dev; __entry->ino = pa->pa_inode->i_ino; __entry->block = block; __entry->count = count; ), TP_printk("dev %d,%d ino %lu block %llu count %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d pstart %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->pa_pstart, __entry->pa_len) ); TRACE_EVENT(ext4_discard_preallocations, TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed), TP_ARGS(inode, len, needed), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) __field( unsigned int, needed ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->len = len; __entry->needed = needed; ), TP_printk("dev %d,%d ino %lu len: %u needed %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->len, __entry->needed) ); TRACE_EVENT(ext4_mb_discard_preallocations, TP_PROTO(struct super_block *sb, int needed), TP_ARGS(sb, needed), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, needed ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->needed = needed; ), TP_printk("dev %d,%d needed %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->needed) ); TRACE_EVENT(ext4_request_blocks, TP_PROTO(struct ext4_allocation_request *ar), TP_ARGS(ar), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu " "lleft %u lright %u pleft %llu pright %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_allocate_blocks, TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), TP_ARGS(ar, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u " "goal %llu lleft %u lright %u pleft %llu pright %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->block, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_free_blocks, TP_PROTO(struct inode *inode, __u64 block, unsigned long count, int flags), TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned long, count ) __field( int, flags ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; __entry->flags = flags; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->block, __entry->count, show_free_flags(__entry->flags)) ); TRACE_EVENT(ext4_sync_file_enter, TP_PROTO(struct file *file, int datasync), TP_ARGS(file, datasync), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) ), TP_fast_assign( struct dentry *dentry = file->f_path.dentry; __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->datasync = datasync; __entry->parent = d_inode(dentry->d_parent)->i_ino; ), TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_file_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); TRACE_EVENT(ext4_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, wait ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->wait = wait; ), TP_printk("dev %d,%d wait %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, data_blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; ), TP_printk("dev %d,%d ino %lu reserved_data_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->data_blocks) ); TRACE_EVENT(ext4_mballoc_alloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, goal_logical ) __field( int, goal_start ) __field( __u32, goal_group ) __field( int, goal_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) __field( __u16, found ) __field( __u16, groups ) __field( __u16, buddy ) __field( __u16, flags ) __field( __u16, tail ) __field( __u8, cr ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->goal_logical = ac->ac_g_ex.fe_logical; __entry->goal_start = ac->ac_g_ex.fe_start; __entry->goal_group = ac->ac_g_ex.fe_group; __entry->goal_len = ac->ac_g_ex.fe_len; __entry->result_logical = ac->ac_f_ex.fe_logical; __entry->result_start = ac->ac_f_ex.fe_start; __entry->result_group = ac->ac_f_ex.fe_group; __entry->result_len = ac->ac_f_ex.fe_len; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; __entry->groups = ac->ac_groups_scanned; __entry->buddy = ac->ac_buddy; __entry->tail = ac->ac_tail; __entry->cr = ac->ac_criteria; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %s flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, __entry->goal_len, __entry->goal_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical, __entry->found, __entry->groups, show_criteria(__entry->cr), show_mballoc_flags(__entry->flags), __entry->tail, __entry->buddy ? 1 << __entry->buddy : 0) ); TRACE_EVENT(ext4_mballoc_prealloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->result_logical = ac->ac_b_ex.fe_logical; __entry->result_start = ac->ac_b_ex.fe_start; __entry->result_group = ac->ac_b_ex.fe_group; __entry->result_len = ac->ac_b_ex.fe_len; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical) ); DECLARE_EVENT_CLASS(ext4__mballoc, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = inode ? inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; __entry->result_len = len; ), TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); TRACE_EVENT(ext4_forget, TP_PROTO(struct inode *inode, int is_metadata, __u64 block), TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( int, is_metadata ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->is_metadata = is_metadata; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, TP_PROTO(struct inode *inode, int used_blocks, int quota_claim), TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) __field( int, quota_claim ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->used_blocks = used_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->quota_claim = quota_claim; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->quota_claim) ); TRACE_EVENT(ext4_da_reserve_space, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->reserved_data_blocks) ); TRACE_EVENT(ext4_da_release_space, TP_PROTO(struct inode *inode, int freed_blocks), TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->freed_blocks = freed_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks) ); DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); TRACE_EVENT(ext4_read_block_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch), TP_ARGS(sb, group, prefetch), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( bool, prefetch ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->prefetch = prefetch; ), TP_printk("dev %d,%d group %u prefetch %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->prefetch) ); DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, offset ) __field( loff_t, len ) __field( int, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len, show_falloc_mode(__entry->mode)) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); TRACE_EVENT(ext4_fallocate_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret), TP_ARGS(inode, offset, max_blocks, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, blocks ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->blocks = max_blocks; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->blocks, __entry->ret) ); TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), TP_ARGS(parent, dentry), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( loff_t, size ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->parent = parent->i_ino; __entry->size = d_inode(dentry)->i_size; ), TP_printk("dev %d,%d ino %lu size %lld parent %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->size, (unsigned long) __entry->parent) ); TRACE_EVENT(ext4_unlink_exit, TP_PROTO(struct dentry *dentry, int ret), TP_ARGS(dentry, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); DECLARE_EVENT_CLASS(ext4__truncate, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->blocks = inode->i_blocks; ), TP_printk("dev %d,%d ino %lu blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->blocks) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* 'ux' is the unwritten extent. */ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux), TP_ARGS(inode, map, ux), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u " "u_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk) ); /* * 'ux' is the unwritten extent. * 'ix' is the initialized extent to which blocks are transferred. */ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux, struct ext4_extent *ix), TP_ARGS(inode, map, ux, ix), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) __field( ext4_lblk_t, i_lblk ) __field( unsigned, i_len ) __field( ext4_fsblk_t, i_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); __entry->i_lblk = le32_to_cpu(ix->ee_block); __entry->i_len = ext4_ext_get_actual_len(ix); __entry->i_pblk = ext4_ext_pblock(ix); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u " "u_lblk %u u_len %u u_pblk %llu " "i_lblk %u i_len %u i_pblk %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk, __entry->i_lblk, __entry->i_len, __entry->i_pblk) ); DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, unsigned int flags), TP_ARGS(inode, lblk, len, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->flags = flags; ), TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_map_flags(__entry->flags)) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, flags ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, mflags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->pblk = map->m_pblk; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u " "mflags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, __entry->len, show_mflags(__entry->mflags), __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); TRACE_EVENT(ext4_ext_load_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), TP_ARGS(inode, lblk, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk) ); TRACE_EVENT(ext4_load_inode, TP_PROTO(struct super_block *sb, unsigned long ino), TP_ARGS(sb, ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = ino; ), TP_printk("dev %d,%d ino %ld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_journal_start_sb, TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_inode, TP_PROTO(struct inode *inode, int blocks, int rsv_blocks, int revoke_creds, int type, unsigned long IP), TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP), TP_STRUCT__entry( __field( unsigned long, ino ) __field( dev_t, dev ) __field( unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) __field( int, type ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; __entry->type = type; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d," " type %d, ino %lu, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, __entry->type, __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_reserved, TP_PROTO(struct super_block *sb, int blocks, unsigned long IP), TP_ARGS(sb, blocks, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, ip ) __field( int, blocks ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; ), TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); DECLARE_EVENT_CLASS(ext4__trim, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len), TP_STRUCT__entry( __field( int, dev_major ) __field( int, dev_minor ) __field( __u32, group ) __field( int, start ) __field( int, len ) ), TP_fast_assign( __entry->dev_major = MAJOR(sb->s_dev); __entry->dev_minor = MINOR(sb->s_dev); __entry->group = group; __entry->start = start; __entry->len = len; ), TP_printk("dev %d,%d group %u, start %d, len %d", __entry->dev_major, __entry->dev_minor, __entry->group, __entry->start, __entry->len) ); DEFINE_EVENT(ext4__trim, ext4_trim_extent, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); TRACE_EVENT(ext4_ext_handle_unwritten_extents, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( unsigned int, allocated ) __field( ext4_fsblk_t, newblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->allocated = allocated; __entry->newblk = newblock; ), TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_map_flags(__entry->flags), (unsigned int) __entry->allocated, (unsigned long long) __entry->newblk) ); TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret), TP_ARGS(sb, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( int, ret ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->ret = ret; ), TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_mflags(__entry->flags), __entry->ret) ); TRACE_EVENT(ext4_ext_show_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, unsigned short len), TP_ARGS(inode, lblk, pblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned short, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, (unsigned short) __entry->len) ); TRACE_EVENT(ext4_remove_blocks, TP_PROTO(struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_fsblk_t to, struct partial_cluster *pc), TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, struct partial_cluster *pc), TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, TP_PROTO(struct inode *inode, ext4_fsblk_t pblk), TP_ARGS(inode, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; ), TP_printk("dev %d,%d ino %lu index_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long long) __entry->pblk) ); TRACE_EVENT(ext4_ext_remove_space, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth), TP_ARGS(inode, start, end, depth), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth) ); TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth, struct partial_cluster *pc, __le16 eh_entries), TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state ) __field( unsigned short, eh_entries ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d " "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); DECLARE_EVENT_CLASS(ext4__es_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); TRACE_EVENT(ext4_es_remove_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, lblk ) __field( loff_t, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu es [%lld/%lld)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len) ); TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); TRACE_EVENT(ext4_es_lookup_extent_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_lookup_extent_exit, TP_PROTO(struct inode *inode, struct extent_status *es, int found), TP_ARGS(inode, es, found), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( int, found ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->found = found; ), TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->found, __entry->lblk, __entry->len, __entry->found ? __entry->pblk : 0, show_extent_status(__entry->found ? __entry->status : 0)) ); DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_to_scan ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_to_scan = nr_to_scan; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); TRACE_EVENT(ext4_es_shrink_scan_exit, TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_insert_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_es_shrink, TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, int nr_skipped, int retried), TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( unsigned long long, scan_time ) __field( int, nr_skipped ) __field( int, retried ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->scan_time = div_u64(scan_time, 1000); __entry->nr_skipped = nr_skipped; __entry->retried = retried; ), TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu " "nr_skipped %d retried %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); TRACE_EVENT(ext4_es_insert_delayed_block, TP_PROTO(struct inode *inode, struct extent_status *es, bool allocated), TP_ARGS(inode, es, allocated), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( bool, allocated ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->allocated = allocated; ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " "allocated %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->allocated) ); /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, u64 owner), TP_ARGS(sb, keydev, agno, bno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u32, agno) __field(u64, bno) __field(u64, len) __field(u64, owner) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; __entry->bno = bno; __entry->len = len; __entry->owner = owner; ), TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, __entry->bno, __entry->len, __entry->owner) ) #define DEFINE_FSMAP_EVENT(name) \ DEFINE_EVENT(ext4_fsmap_class, name, \ TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \ u64 owner), \ TP_ARGS(sb, keydev, agno, bno, len, owner)) DEFINE_FSMAP_EVENT(ext4_fsmap_low_key); DEFINE_FSMAP_EVENT(ext4_fsmap_high_key); DEFINE_FSMAP_EVENT(ext4_fsmap_mapping); DECLARE_EVENT_CLASS(ext4_getfsmap_class, TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), TP_ARGS(sb, fsmap), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u64, block) __field(u64, len) __field(u64, owner) __field(u64, flags) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(fsmap->fmr_device); __entry->block = fsmap->fmr_physical; __entry->len = fsmap->fmr_length; __entry->owner = fsmap->fmr_owner; __entry->flags = fsmap->fmr_flags; ), TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, __entry->len, __entry->owner, __entry->flags) ) #define DEFINE_GETFSMAP_EVENT(name) \ DEFINE_EVENT(ext4_getfsmap_class, name, \ TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \ TP_ARGS(sb, fsmap)) DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); TRACE_EVENT(ext4_shutdown, TP_PROTO(struct super_block *sb, unsigned long flags), TP_ARGS(sb, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned, flags ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = flags; ), TP_printk("dev %d,%d flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags) ); TRACE_EVENT(ext4_error, TP_PROTO(struct super_block *sb, const char *function, unsigned int line), TP_ARGS(sb, function, line), TP_STRUCT__entry( __field( dev_t, dev ) __field( const char *, function ) __field( unsigned, line ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->function = function; __entry->line = line; ), TP_printk("dev %d,%d function %s line %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->function, __entry->line) ); TRACE_EVENT(ext4_prefetch_bitmaps, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_group_t next, unsigned int prefetch_ios), TP_ARGS(sb, group, next, prefetch_ios), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( __u32, next ) __field( __u32, ios ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->next = next; __entry->ios = prefetch_ios; ), TP_printk("dev %d,%d group %u next %u ios %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->next, __entry->ios) ); TRACE_EVENT(ext4_lazy_itable_init, TP_PROTO(struct super_block *sb, ext4_group_t group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); TRACE_EVENT(ext4_fc_replay_scan, TP_PROTO(struct super_block *sb, int error, int off), TP_ARGS(sb, error, off), TP_STRUCT__entry( __field(dev_t, dev) __field(int, error) __field(int, off) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->error = error; __entry->off = off; ), TP_printk("dev %d,%d error %d, off %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->error, __entry->off) ); TRACE_EVENT(ext4_fc_replay, TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2), TP_ARGS(sb, tag, ino, priv1, priv2), TP_STRUCT__entry( __field(dev_t, dev) __field(int, tag) __field(int, ino) __field(int, priv1) __field(int, priv2) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tag = tag; __entry->ino = ino; __entry->priv1 = priv1; __entry->priv2 = priv2; ), TP_printk("dev %d,%d: tag %d, ino %d, data1 %d, data2 %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tag, __entry->ino, __entry->priv1, __entry->priv2) ); TRACE_EVENT(ext4_fc_commit_start, TP_PROTO(struct super_block *sb, tid_t commit_tid), TP_ARGS(sb, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, tid) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tid = commit_tid; ), TP_printk("dev %d,%d tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid) ); TRACE_EVENT(ext4_fc_commit_stop, TP_PROTO(struct super_block *sb, int nblks, int reason, tid_t commit_tid), TP_ARGS(sb, nblks, reason, commit_tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, nblks) __field(int, reason) __field(int, num_fc) __field(int, num_fc_ineligible) __field(int, nblks_agg) __field(tid_t, tid) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nblks = nblks; __entry->reason = reason; __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->num_fc_ineligible = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks; __entry->tid = commit_tid; ), TP_printk("dev %d,%d nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nblks, __entry->reason, __entry->num_fc, __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid) ); #define FC_REASON_NAME_STAT(reason) \ show_fc_reason(reason), \ __entry->fc_ineligible_rc[reason] TRACE_EVENT(ext4_fc_stats, TP_PROTO(struct super_block *sb), TP_ARGS(sb), TP_STRUCT__entry( __field(dev_t, dev) __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX) __field(unsigned long, fc_commits) __field(unsigned long, fc_ineligible_commits) __field(unsigned long, fc_numblks) ), TP_fast_assign( int i; __entry->dev = sb->s_dev; for (i = 0; i < EXT4_FC_REASON_MAX; i++) { __entry->fc_ineligible_rc[i] = EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i]; } __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->fc_ineligible_commits = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks; ), TP_printk("dev %d,%d fc ineligible reasons:\n" "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u" "num_commits:%lu, ineligible: %lu, numblks: %lu", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME), __entry->fc_commits, __entry->fc_ineligible_commits, __entry->fc_numblks) ); DECLARE_EVENT_CLASS(ext4_fc_track_dentry, TP_PROTO(handle_t *handle, struct inode *inode, struct dentry *dentry, int ret), TP_ARGS(handle, inode, dentry, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d,%d, t_tid %u, ino %lu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error ) ); #define DEFINE_EVENT_CLASS_DENTRY(__type) \ DEFINE_EVENT(ext4_fc_track_dentry, ext4_fc_track_##__type, \ TP_PROTO(handle_t *handle, struct inode *inode, \ struct dentry *dentry, int ret), \ TP_ARGS(handle, inode, dentry, ret) \ ) DEFINE_EVENT_CLASS_DENTRY(create); DEFINE_EVENT_CLASS_DENTRY(link); DEFINE_EVENT_CLASS_DENTRY(unlink); TRACE_EVENT(ext4_fc_track_inode, TP_PROTO(handle_t *handle, struct inode *inode, int ret), TP_ARGS(handle, inode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error) ); TRACE_EVENT(ext4_fc_track_range, TP_PROTO(handle_t *handle, struct inode *inode, long start, long end, int ret), TP_ARGS(handle, inode, start, end, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(tid_t, t_tid) __field(ino_t, i_ino) __field(tid_t, i_sync_tid) __field(long, start) __field(long, end) __field(int, error) ), TP_fast_assign( struct ext4_inode_info *ei = EXT4_I(inode); __entry->dev = inode->i_sb->s_dev; __entry->t_tid = handle->h_transaction->t_tid; __entry->i_ino = inode->i_ino; __entry->i_sync_tid = ei->i_sync_tid; __entry->start = start; __entry->end = end; __entry->error = ret; ), TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d, start %ld, end %ld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->t_tid, __entry->i_ino, __entry->i_sync_tid, __entry->error, __entry->start, __entry->end) ); TRACE_EVENT(ext4_fc_cleanup, TP_PROTO(journal_t *journal, int full, tid_t tid), TP_ARGS(journal, full, tid), TP_STRUCT__entry( __field(dev_t, dev) __field(int, j_fc_off) __field(int, full) __field(tid_t, tid) ), TP_fast_assign( struct super_block *sb = journal->j_private; __entry->dev = sb->s_dev; __entry->j_fc_off = journal->j_fc_off; __entry->full = full; __entry->tid = tid; ), TP_printk("dev %d,%d, j_fc_off %d, full %d, tid %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->j_fc_off, __entry->full, __entry->tid) ); TRACE_EVENT(ext4_update_sb, TP_PROTO(struct super_block *sb, ext4_fsblk_t fsblk, unsigned int flags), TP_ARGS(sb, fsblk, flags), TP_STRUCT__entry( __field(dev_t, dev) __field(ext4_fsblk_t, fsblk) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->fsblk = fsblk; __entry->flags = flags; ), TP_printk("dev %d,%d fsblk %llu flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->fsblk, __entry->flags) ); #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
7690 375 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM percpu #if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PERCPU_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(percpu_alloc_percpu, TP_PROTO(unsigned long call_site, bool reserved, bool is_atomic, size_t size, size_t align, void *base_addr, int off, void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags), TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off, ptr, bytes_alloc, gfp_flags), TP_STRUCT__entry( __field( unsigned long, call_site ) __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) ), TP_fast_assign( __entry->call_site = call_site; __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; __entry->bytes_alloc = bytes_alloc; __entry->gfp_flags = (__force unsigned long)gfp_flags; ), TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s", (void *)__entry->call_site, __entry->reserved, __entry->is_atomic, __entry->size, __entry->align, __entry->base_addr, __entry->off, __entry->ptr, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(percpu_free_percpu, TP_PROTO(void *base_addr, int off, void __percpu *ptr), TP_ARGS(base_addr, off, ptr), TP_STRUCT__entry( __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) ), TP_fast_assign( __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; ), TP_printk("base_addr=%p off=%d ptr=%p", __entry->base_addr, __entry->off, __entry->ptr) ); TRACE_EVENT(percpu_alloc_percpu_fail, TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align), TP_ARGS(reserved, is_atomic, size, align), TP_STRUCT__entry( __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) ), TP_fast_assign( __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; ), TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu", __entry->reserved, __entry->is_atomic, __entry->size, __entry->align) ); TRACE_EVENT(percpu_create_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); TRACE_EVENT(percpu_destroy_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); #endif /* _TRACE_PERCPU_H */ #include <trace/define_trace.h>
181 1761 1761 181 181 168 35 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2005 Voltaire Inc. All rights reserved. * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved. * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved. * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. */ #include <linux/completion.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rbtree.h> #include <linux/igmp.h> #include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netevent.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/ip_fib.h> #include <net/ip6_route.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/rdma_netlink.h> #include <rdma/ib.h> #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> #include "core_priv.h" #include "cma_priv.h" #include "cma_trace.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); #define CMA_CM_RESPONSE_TIMEOUT 20 #define CMA_MAX_CM_RETRIES 15 #define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24) #define CMA_IBOE_PACKET_LIFETIME 16 #define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP static const char * const cma_events[] = { [RDMA_CM_EVENT_ADDR_RESOLVED] = "address resolved", [RDMA_CM_EVENT_ADDR_ERROR] = "address error", [RDMA_CM_EVENT_ROUTE_RESOLVED] = "route resolved ", [RDMA_CM_EVENT_ROUTE_ERROR] = "route error", [RDMA_CM_EVENT_CONNECT_REQUEST] = "connect request", [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response", [RDMA_CM_EVENT_CONNECT_ERROR] = "connect error", [RDMA_CM_EVENT_UNREACHABLE] = "unreachable", [RDMA_CM_EVENT_REJECTED] = "rejected", [RDMA_CM_EVENT_ESTABLISHED] = "established", [RDMA_CM_EVENT_DISCONNECTED] = "disconnected", [RDMA_CM_EVENT_DEVICE_REMOVAL] = "device removal", [RDMA_CM_EVENT_MULTICAST_JOIN] = "multicast join", [RDMA_CM_EVENT_MULTICAST_ERROR] = "multicast error", [RDMA_CM_EVENT_ADDR_CHANGE] = "address change", [RDMA_CM_EVENT_TIMEWAIT_EXIT] = "timewait exit", }; static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type); const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event) { size_t index = event; return (index < ARRAY_SIZE(cma_events) && cma_events[index]) ? cma_events[index] : "unrecognized event"; } EXPORT_SYMBOL(rdma_event_msg); const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return ibcm_reject_msg(reason); if (rdma_protocol_iwarp(id->device, id->port_num)) return iwcm_reject_msg(reason); WARN_ON_ONCE(1); return "unrecognized transport"; } EXPORT_SYMBOL(rdma_reject_msg); /** * rdma_is_consumer_reject - return true if the consumer rejected the connect * request. * @id: Communication identifier that received the REJECT event. * @reason: Value returned in the REJECT event status field. */ static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; if (rdma_protocol_iwarp(id->device, id->port_num)) return reason == -ECONNREFUSED; WARN_ON_ONCE(1); return false; } const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) { const void *p; if (rdma_is_consumer_reject(id, ev->status)) { *data_len = ev->param.conn.private_data_len; p = ev->param.conn.private_data; } else { *data_len = 0; p = NULL; } return p; } EXPORT_SYMBOL(rdma_consumer_reject_data); /** * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. * @id: Communication Identifier */ struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); if (id->device->node_type == RDMA_NODE_RNIC) return id_priv->cm_id.iw; return NULL; } EXPORT_SYMBOL(rdma_iw_cm_id); /** * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. * @res: rdma resource tracking entry pointer */ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); return &id_priv->id; } EXPORT_SYMBOL(rdma_res_to_id); static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { .name = "cma", .add = cma_add_one, .remove = cma_remove_one }; static struct ib_sa_client sa_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); static struct rb_root id_table = RB_ROOT; /* Serialize operations of id_table tree */ static DEFINE_SPINLOCK(id_table_lock); static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { struct xarray tcp_ps; struct xarray udp_ps; struct xarray ipoib_ps; struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) { return net_generic(net, cma_pernet_id); } static struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); switch (ps) { case RDMA_PS_TCP: return &pernet->tcp_ps; case RDMA_PS_UDP: return &pernet->udp_ps; case RDMA_PS_IPOIB: return &pernet->ipoib_ps; case RDMA_PS_IB: return &pernet->ib_ps; default: return NULL; } } struct id_table_entry { struct list_head id_list; struct rb_node rb_node; }; struct cma_device { struct list_head list; struct ib_device *device; struct completion comp; refcount_t refcount; struct list_head id_list; enum ib_gid_type *default_gid_type; u8 *default_roce_tos; }; struct rdma_bind_list { enum rdma_ucm_port_space ps; struct hlist_head owners; unsigned short port; }; static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { struct xarray *xa = cma_pernet_xa(net, ps); xa_erase(xa, snum); } enum { CMA_OPTION_AFONLY, }; void cma_dev_get(struct cma_device *cma_dev) { refcount_inc(&cma_dev->refcount); } void cma_dev_put(struct cma_device *cma_dev) { if (refcount_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); } struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, void *cookie) { struct cma_device *cma_dev; struct cma_device *found_cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) if (filter(cma_dev->device, cookie)) { found_cma_dev = cma_dev; break; } if (found_cma_dev) cma_dev_get(found_cma_dev); mutex_unlock(&lock); return found_cma_dev; } int cma_get_default_gid_type(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_gid_type(struct cma_device *cma_dev, u32 port, enum ib_gid_type default_gid_type) { unsigned long supported_gids; if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; if (default_gid_type == IB_GID_TYPE_IB && rdma_protocol_roce_eth_encap(cma_dev->device, port)) default_gid_type = IB_GID_TYPE_ROCE; supported_gids = roce_gid_type_mask_support(cma_dev->device, port); if (!(supported_gids & 1 << default_gid_type)) return -EINVAL; cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = default_gid_type; return 0; } int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; return cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)]; } int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port, u8 default_roce_tos) { if (!rdma_is_port_valid(cma_dev->device, port)) return -EINVAL; cma_dev->default_roce_tos[port - rdma_start_port(cma_dev->device)] = default_roce_tos; return 0; } struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) { return cma_dev->device; } /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. * We do this by disabling removal notification while a callback is in process, * and reporting it after the callback completes. */ struct cma_multicast { struct rdma_id_private *id_priv; union { struct ib_sa_multicast *sa_mc; struct { struct work_struct work; struct rdma_cm_event event; } iboe_join; }; struct list_head list; void *context; struct sockaddr_storage addr; u8 join_state; }; struct cma_work { struct work_struct work; struct rdma_id_private *id; enum rdma_cm_state old_state; enum rdma_cm_state new_state; struct rdma_cm_event event; }; union cma_ip_addr { struct in6_addr ip6; struct { __be32 pad[3]; __be32 addr; } ip4; }; struct cma_hdr { u8 cma_version; u8 ip_version; /* IP version: 7:4 */ __be16 port; union cma_ip_addr src_addr; union cma_ip_addr dst_addr; }; #define CMA_VERSION 0x00 struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; union ib_gid local_gid; __be64 service_id; int port; bool has_gid; u16 pkey; }; static int cma_comp_exch(struct rdma_id_private *id_priv, enum rdma_cm_state comp, enum rdma_cm_state exch) { unsigned long flags; int ret; /* * The FSM uses a funny double locking where state is protected by both * the handler_mutex and the spinlock. State is not allowed to change * to/from a handler_mutex protected value without also holding * handler_mutex. */ if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT) lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); if ((ret = (id_priv->state == comp))) id_priv->state = exch; spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr) { return hdr->ip_version >> 4; } static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) { hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.src_addr; } static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv) { return (struct sockaddr *)&id_priv->id.route.addr.dst_addr; } static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { struct in_device *in_dev = NULL; if (ndev) { rtnl_lock(); in_dev = __in_dev_get_rtnl(ndev); if (in_dev) { if (join) ip_mc_inc_group(in_dev, *(__be32 *)(mgid->raw + 12)); else ip_mc_dec_group(in_dev, *(__be32 *)(mgid->raw + 12)); } rtnl_unlock(); } return (in_dev) ? 0 : -ENODEV; } static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa, struct id_table_entry *entry_b) { struct rdma_id_private *id_priv = list_first_entry( &entry_b->id_list, struct rdma_id_private, id_list_entry); int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if; struct sockaddr *sb = cma_dst_addr(id_priv); if (ifindex_a != ifindex_b) return (ifindex_a > ifindex_b) ? 1 : -1; if (sa->sa_family != sb->sa_family) return sa->sa_family - sb->sa_family; if (sa->sa_family == AF_INET && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) { return memcmp(&((struct sockaddr_in *)sa)->sin_addr, &((struct sockaddr_in *)sb)->sin_addr, sizeof(((struct sockaddr_in *)sa)->sin_addr)); } if (sa->sa_family == AF_INET6 && __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) { return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &((struct sockaddr_in6 *)sb)->sin6_addr); } return -1; } static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv) { struct rb_node **new, *parent = NULL; struct id_table_entry *this, *node; unsigned long flags; int result; node = kzalloc(sizeof(*node), GFP_KERNEL); if (!node) return -ENOMEM; spin_lock_irqsave(&id_table_lock, flags); new = &id_table.rb_node; while (*new) { this = container_of(*new, struct id_table_entry, rb_node); result = compare_netdev_and_ip( node_id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(node_id_priv), this); parent = *new; if (result < 0) new = &((*new)->rb_left); else if (result > 0) new = &((*new)->rb_right); else { list_add_tail(&node_id_priv->id_list_entry, &this->id_list); kfree(node); goto unlock; } } INIT_LIST_HEAD(&node->id_list); list_add_tail(&node_id_priv->id_list_entry, &node->id_list); rb_link_node(&node->rb_node, parent, new); rb_insert_color(&node->rb_node, &id_table); unlock: spin_unlock_irqrestore(&id_table_lock, flags); return 0; } static struct id_table_entry * node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa) { struct rb_node *node = root->rb_node; struct id_table_entry *data; int result; while (node) { data = container_of(node, struct id_table_entry, rb_node); result = compare_netdev_and_ip(ifindex, sa, data); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return data; } return NULL; } static void cma_remove_id_from_tree(struct rdma_id_private *id_priv) { struct id_table_entry *data; unsigned long flags; spin_lock_irqsave(&id_table_lock, flags); if (list_empty(&id_priv->id_list_entry)) goto out; data = node_from_ndev_ip(&id_table, id_priv->id.route.addr.dev_addr.bound_dev_if, cma_dst_addr(id_priv)); if (!data) goto out; list_del_init(&id_priv->id_list_entry); if (list_empty(&data->id_list)) { rb_erase(&data->rb_node, &id_table); kfree(data); } out: spin_unlock_irqrestore(&id_table_lock, flags); } static void _cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { cma_dev_get(cma_dev); id_priv->cma_dev = cma_dev; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->device_item, &cma_dev->id_list); trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev) { _cma_attach_to_dev(id_priv, cma_dev); id_priv->gid_type = cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(cma_dev->device)]; } static void cma_release_dev(struct rdma_id_private *id_priv) { mutex_lock(&lock); list_del_init(&id_priv->device_item); cma_dev_put(id_priv->cma_dev); id_priv->cma_dev = NULL; id_priv->id.device = NULL; if (id_priv->id.route.addr.dev_addr.sgid_attr) { rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = NULL; } mutex_unlock(&lock); } static inline unsigned short cma_family(struct rdma_id_private *id_priv) { return id_priv->id.route.addr.src_addr.ss_family; } static int cma_set_default_qkey(struct rdma_id_private *id_priv) { struct ib_sa_mcmember_rec rec; int ret = 0; switch (id_priv->id.ps) { case RDMA_PS_UDP: case RDMA_PS_IB: id_priv->qkey = RDMA_UDP_QKEY; break; case RDMA_PS_IPOIB: ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (!ret) id_priv->qkey = be32_to_cpu(rec.qkey); break; default: break; } return ret; } static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey) { if (!qkey || (id_priv->qkey && (id_priv->qkey != qkey))) return -EINVAL; id_priv->qkey = qkey; return 0; } static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr) { dev_addr->dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr); ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey)); } static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { int ret; if (addr->sa_family != AF_IB) { ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; } return ret; } static const struct ib_gid_attr * cma_validate_port(struct ib_device *device, u32 port, enum ib_gid_type gid_type, union ib_gid *gid, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV); int bound_if_index = dev_addr->bound_dev_if; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) goto out; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) goto out; if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) goto out; /* * For drivers that do not associate more than one net device with * their gid tables, such as iWARP drivers, it is sufficient to * return the first table entry. * * Other driver classes might be included in the future. */ if (rdma_protocol_iwarp(device, port)) { sgid_attr = rdma_get_gid_attr(device, port, 0); if (IS_ERR(sgid_attr)) goto out; rcu_read_lock(); ndev = rcu_dereference(sgid_attr->ndev); if (!net_eq(dev_net(ndev), dev_addr->net) || ndev->ifindex != bound_if_index) sgid_attr = ERR_PTR(-ENODEV); rcu_read_unlock(); goto out; } if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) goto out; } else { gid_type = IB_GID_TYPE_IB; } sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); dev_put(ndev); out: return sgid_attr; } static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, const struct ib_gid_attr *sgid_attr) { WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } /** * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute * based on source ip address. * @id_priv: cm_id which should be bound to cma device * * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute * based on source IP address. It returns 0 on success or error code otherwise. * It is applicable to active and passive side cm_id. */ static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; union ib_gid gid, iboe_gid, *gidp; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, gidp, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); cma_attach_to_dev(id_priv, cma_dev); ret = 0; goto out; } } } out: mutex_unlock(&lock); return ret; } /** * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute * @id_priv: cm id to bind to cma device * @listen_id_priv: listener cm id to match against * @req: Pointer to req structure containaining incoming * request information * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when * rdma device matches for listen_id and incoming request. It also verifies * that a GID table entry is present for the source address. * Returns 0 on success, or returns error code otherwise. */ static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv, struct cma_req_info *req) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; enum ib_gid_type gid_type; union ib_gid gid; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; if (rdma_protocol_roce(req->device, req->port)) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &gid); else memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; sgid_attr = cma_validate_port(req->device, req->port, gid_type, &gid, id_priv); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); id_priv->id.port_num = req->port; cma_bind_sgid_attr(id_priv, sgid_attr); /* Need to acquire lock to protect against reader * of cma_dev->id_list such as cma_netdev_callback() and * cma_process_remove(). */ mutex_lock(&lock); cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); mutex_unlock(&lock); rdma_restrack_add(&id_priv->res); return 0; } static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; union ib_gid gid; u32 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof(gid)); mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } list_for_each_entry(cma_dev, &dev_list, list) { rdma_for_each_port (cma_dev->device, port) { if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); ret = 0; goto out; } } } out: if (!ret) { cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); } mutex_unlock(&lock); return ret; } /* * Select the source IB device and address to reach the destination IB address. */ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; struct sockaddr_ib *addr; union ib_gid gid, sgid, *dgid; unsigned int p; u16 pkey, index; enum ib_port_state port_state; int ret; int i; cma_dev = NULL; addr = (struct sockaddr_ib *) cma_dst_addr(id_priv); dgid = (union ib_gid *) &addr->sib_addr; pkey = ntohs(addr->sib_pkey); mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { rdma_for_each_port (cur_dev->device, p) { if (!rdma_cap_af_ib(cur_dev->device, p)) continue; if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index)) continue; if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len; ++i) { ret = rdma_query_gid(cur_dev->device, p, i, &gid); if (ret) continue; if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } if (!cma_dev && (gid.global.subnet_prefix == dgid->global.subnet_prefix) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; sgid = gid; id_priv->id.port_num = p; goto found; } } } } mutex_unlock(&lock); return -ENODEV; found: cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); mutex_unlock(&lock); addr = (struct sockaddr_ib *)cma_src_addr(id_priv); memcpy(&addr->sib_addr, &sgid, sizeof(sgid)); cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr); return 0; } static void cma_id_get(struct rdma_id_private *id_priv) { refcount_inc(&id_priv->refcount); } static void cma_id_put(struct rdma_id_private *id_priv) { if (refcount_dec_and_test(&id_priv->refcount)) complete(&id_priv->comp); } static struct rdma_id_private * __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const struct rdma_id_private *parent) { struct rdma_id_private *id_priv; id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL); if (!id_priv) return ERR_PTR(-ENOMEM); id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; id_priv->timeout_set = false; id_priv->min_rnr_timer_set = false; id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); refcount_set(&id_priv->refcount, 1); mutex_init(&id_priv->handler_mutex); INIT_LIST_HEAD(&id_priv->device_item); INIT_LIST_HEAD(&id_priv->id_list_entry); INIT_LIST_HEAD(&id_priv->listen_list); INIT_LIST_HEAD(&id_priv->mc_list); get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num); id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; rdma_restrack_new(&id_priv->res, RDMA_RESTRACK_CM_ID); if (parent) rdma_restrack_parent_name(&id_priv->res, &parent->res); return id_priv; } struct rdma_cm_id * __rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type, const char *caller) { struct rdma_id_private *ret; ret = __rdma_create_id(net, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, caller); return &ret->id; } EXPORT_SYMBOL(__rdma_create_kernel_id); struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler, void *context, enum rdma_ucm_port_space ps, enum ib_qp_type qp_type) { struct rdma_id_private *ret; ret = __rdma_create_id(current->nsproxy->net_ns, event_handler, context, ps, qp_type, NULL); if (IS_ERR(ret)) return ERR_CAST(ret); rdma_restrack_set_name(&ret->res, NULL); return &ret->id; } EXPORT_SYMBOL(rdma_create_user_id); static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); if (ret) return ret; qp_attr.qp_state = IB_QPS_RTS; qp_attr.sq_psn = 0; ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); return ret; } static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) return ret; return ib_modify_qp(qp, &qp_attr, qp_attr_mask); } int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { struct rdma_id_private *id_priv; struct ib_qp *qp; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (id->device != pd->device) { ret = -EINVAL; goto out_err; } qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) { ret = PTR_ERR(qp); goto out_err; } if (id->qp_type == IB_QPT_UD) ret = cma_init_ud_qp(id_priv, qp); else ret = cma_init_conn_qp(id_priv, qp); if (ret) goto out_destroy; id->qp = qp; id_priv->qp_num = qp->qp_num; id_priv->srq = (qp->srq != NULL); trace_cm_qp_create(id_priv, pd, qp_init_attr, 0); return 0; out_destroy: ib_destroy_qp(qp); out_err: trace_cm_qp_create(id_priv, pd, qp_init_attr, ret); return ret; } EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); trace_cm_qp_destroy(id_priv); mutex_lock(&id_priv->qp_mutex); ib_destroy_qp(id_priv->id.qp); id_priv->id.qp = NULL; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); static int cma_modify_qp_rtr(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) goto out; qp_attr.qp_state = IB_QPS_RTR; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; BUG_ON(id_priv->cma_dev->device != id_priv->id.device); if (conn_param) qp_attr.max_dest_rd_atomic = conn_param->responder_resources; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_rts(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_RTS; ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) goto out; if (conn_param) qp_attr.max_rd_atomic = conn_param->initiator_depth; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int ret; mutex_lock(&id_priv->qp_mutex); if (!id_priv->id.qp) { ret = 0; goto out; } qp_attr.qp_state = IB_QPS_ERR; ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); out: mutex_unlock(&id_priv->qp_mutex); return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int ret; u16 pkey; if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num)) pkey = 0xffff; else pkey = ib_addr_get_pkey(dev_addr); ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num, pkey, &qp_attr->pkey_index); if (ret) return ret; qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT; if (id_priv->id.qp_type == IB_QPT_UD) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; qp_attr->qkey = id_priv->qkey; *qp_attr_mask |= IB_QP_QKEY; } else { qp_attr->qp_access_flags = 0; *qp_attr_mask |= IB_QP_ACCESS_FLAGS; } return 0; } int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, int *qp_attr_mask) { struct rdma_id_private *id_priv; int ret = 0; id_priv = container_of(id, struct rdma_id_private, id); if (rdma_cap_ib_cm(id->device, id->port_num)) { if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD)) ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask); else ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr, qp_attr_mask); if (qp_attr->qp_state == IB_QPS_RTR) qp_attr->rq_psn = id_priv->seq_num; } else if (rdma_cap_iw_cm(id->device, id->port_num)) { if (!id_priv->cm_id.iw) { qp_attr->qp_access_flags = 0; *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS; } else ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr, qp_attr_mask); qp_attr->port_num = id_priv->id.port_num; *qp_attr_mask |= IB_QP_PORT; } else { ret = -ENOSYS; } if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set) qp_attr->timeout = id_priv->timeout; if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set) qp_attr->min_rnr_timer = id_priv->min_rnr_timer; return ret; } EXPORT_SYMBOL(rdma_init_qp_attr); static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_loopback( ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: return ipv6_addr_loopback( &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: return ib_addr_loopback( &((struct sockaddr_ib *)addr)->sib_addr); default: return false; } } static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: return ((struct sockaddr_in *)src)->sin_addr.s_addr != ((struct sockaddr_in *)dst)->sin_addr.s_addr; case AF_INET6: { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; bool link_local; if (ipv6_addr_cmp(&src_addr6->sin6_addr, &dst_addr6->sin6_addr)) return 1; link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL; /* Link local must match their scope_ids */ return link_local ? (src_addr6->sin6_scope_id != dst_addr6->sin6_scope_id) : 0; } default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); } } static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; switch (addr->sa_family) { case AF_INET: return ((struct sockaddr_in *) addr)->sin_port; case AF_INET6: return ((struct sockaddr_in6 *) addr)->sin6_port; case AF_IB: sib = (struct sockaddr_ib *) addr; return htons((u16) (be64_to_cpu(sib->sib_sid) & be64_to_cpu(sib->sib_sid_mask))); default: return 0; } } static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr; if (src_addr) { ib = (struct sockaddr_ib *)src_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->sgid, 16); ib->sib_sid = path->service_id; ib->sib_scope_id = 0; } else { ib->sib_pkey = listen_ib->sib_pkey; ib->sib_flowinfo = listen_ib->sib_flowinfo; ib->sib_addr = listen_ib->sib_addr; ib->sib_sid = listen_ib->sib_sid; ib->sib_scope_id = listen_ib->sib_scope_id; } ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); } if (dst_addr) { ib = (struct sockaddr_ib *)dst_addr; ib->sib_family = AF_IB; if (path) { ib->sib_pkey = path->pkey; ib->sib_flowinfo = path->flow_label; memcpy(&ib->sib_addr, &path->dgid, 16); } } } static void cma_save_ip4_info(struct sockaddr_in *src_addr, struct sockaddr_in *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->dst_addr.ip4.addr, .sin_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in) { .sin_family = AF_INET, .sin_addr.s_addr = hdr->src_addr.ip4.addr, .sin_port = hdr->port, }; } } static void cma_save_ip6_info(struct sockaddr_in6 *src_addr, struct sockaddr_in6 *dst_addr, struct cma_hdr *hdr, __be16 local_port) { if (src_addr) { *src_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->dst_addr.ip6, .sin6_port = local_port, }; } if (dst_addr) { *dst_addr = (struct sockaddr_in6) { .sin6_family = AF_INET6, .sin6_addr = hdr->src_addr.ip6, .sin6_port = hdr->port, }; } } static u16 cma_port_from_service_id(__be64 service_id) { return (u16)be64_to_cpu(service_id); } static int cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; __be16 port; hdr = ib_event->private_data; if (hdr->cma_version != CMA_VERSION) return -EINVAL; port = htons(cma_port_from_service_id(service_id)); switch (cma_get_ip_ver(hdr)) { case 4: cma_save_ip4_info((struct sockaddr_in *)src_addr, (struct sockaddr_in *)dst_addr, hdr, port); break; case 6: cma_save_ip6_info((struct sockaddr_in6 *)src_addr, (struct sockaddr_in6 *)dst_addr, hdr, port); break; default: return -EAFNOSUPPORT; } return 0; } static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { if (ib_event->event == IB_CM_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, ib_event->param.req_rcvd.primary_path); else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) cma_save_ib_info(src_addr, dst_addr, listen_id, NULL); return 0; } return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id); } static int cma_save_req_info(const struct ib_cm_event *ib_event, struct cma_req_info *req) { const struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd; const struct ib_cm_sidr_req_event_param *sidr_param = &ib_event->param.sidr_req_rcvd; switch (ib_event->event) { case IB_CM_REQ_RECEIVED: req->device = req_param->listen_id->device; req->port = req_param->port; memcpy(&req->local_gid, &req_param->primary_path->sgid, sizeof(req->local_gid)); req->has_gid = true; req->service_id = req_param->primary_path->service_id; req->pkey = be16_to_cpu(req_param->primary_path->pkey); if (req->pkey != req_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", req_param->bth_pkey, req->pkey); break; case IB_CM_SIDR_REQ_RECEIVED: req->device = sidr_param->listen_id->device; req->port = sidr_param->port; req->has_gid = false; req->service_id = sidr_param->service_id; req->pkey = sidr_param->pkey; if (req->pkey != sidr_param->bth_pkey) pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n" "RDMA CMA: in the future this may cause the request to be dropped\n", sidr_param->bth_pkey, req->pkey); break; default: return -EINVAL; } return 0; } static bool validate_ipv4_net_dev(struct net_device *net_dev, const struct sockaddr_in *dst_addr, const struct sockaddr_in *src_addr) { __be32 daddr = dst_addr->sin_addr.s_addr, saddr = src_addr->sin_addr.s_addr; struct fib_result res; struct flowi4 fl4; int err; bool ret; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) || ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) || ipv4_is_loopback(saddr)) return false; memset(&fl4, 0, sizeof(fl4)); fl4.flowi4_oif = net_dev->ifindex; fl4.daddr = daddr; fl4.saddr = saddr; rcu_read_lock(); err = fib_lookup(dev_net(net_dev), &fl4, &res, 0); ret = err == 0 && FIB_RES_DEV(res) == net_dev; rcu_read_unlock(); return ret; } static bool validate_ipv6_net_dev(struct net_device *net_dev, const struct sockaddr_in6 *dst_addr, const struct sockaddr_in6 *src_addr) { #if IS_ENABLED(CONFIG_IPV6) const int strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL; struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr, &src_addr->sin6_addr, net_dev->ifindex, NULL, strict); bool ret; if (!rt) return false; ret = rt->rt6i_idev->dev == net_dev; ip6_rt_put(rt); return ret; #else return false; #endif } static bool validate_net_dev(struct net_device *net_dev, const struct sockaddr *daddr, const struct sockaddr *saddr) { const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr; const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr; switch (daddr->sa_family) { case AF_INET: return saddr->sa_family == AF_INET && validate_ipv4_net_dev(net_dev, daddr4, saddr4); case AF_INET6: return saddr->sa_family == AF_INET6 && validate_ipv6_net_dev(net_dev, daddr6, saddr6); default: return false; } } static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; if (!sgid_attr) return NULL; rcu_read_lock(); ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); if (IS_ERR(ndev)) ndev = NULL; else dev_hold(ndev); rcu_read_unlock(); return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = (struct sockaddr *)&req->listen_addr_storage; struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage; struct net_device *net_dev; const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL; int err; err = cma_save_ip_info(listen_addr, src_addr, ib_event, req->service_id); if (err) return ERR_PTR(err); if (rdma_protocol_roce(req->device, req->port)) net_dev = roce_get_net_dev_by_cm_event(ib_event); else net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); return net_dev; } static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id) { return (be64_to_cpu(service_id) >> 16) & 0xffff; } static bool cma_match_private_data(struct rdma_id_private *id_priv, const struct cma_hdr *hdr) { struct sockaddr *addr = cma_src_addr(id_priv); __be32 ip4_addr; struct in6_addr ip6_addr; if (cma_any_addr(addr) && !id_priv->afonly) return true; switch (addr->sa_family) { case AF_INET: ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr; if (cma_get_ip_ver(hdr) != 4) return false; if (!cma_any_addr(addr) && hdr->dst_addr.ip4.addr != ip4_addr) return false; break; case AF_INET6: ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr; if (cma_get_ip_ver(hdr) != 6) return false; if (!cma_any_addr(addr) && memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr))) return false; break; case AF_IB: return true; default: return false; } return true; } static bool cma_protocol_roce(const struct rdma_cm_id *id) { struct ib_device *device = id->device; const u32 port_num = id->port_num ?: rdma_start_port(device); return rdma_protocol_roce(device, port_num); } static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) { const struct sockaddr *daddr = (const struct sockaddr *)&req->listen_addr_storage; const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; /* Returns true if the req is for IPv6 link local */ return (daddr->sa_family == AF_INET6 && (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* * If the request is not for IPv6 link local, allow matching * request to any netdevice of the one or multiport rdma device. */ if (!cma_is_req_ipv6_ll(req)) return true; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. */ if (net_eq(dev_net(net_dev), addr->dev_addr.net) && (!!addr->dev_addr.bound_dev_if == (addr->dev_addr.bound_dev_if == net_dev->ifindex))) return true; else return false; } static struct rdma_id_private *cma_find_listener( const struct rdma_bind_list *bind_list, const struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, const struct cma_req_info *req, const struct net_device *net_dev) { struct rdma_id_private *id_priv, *id_priv_dev; lockdep_assert_held(&lock); if (!bind_list) return ERR_PTR(-EINVAL); hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_item) { if (id_priv_dev->id.device == cm_id->device && cma_match_net_dev(&id_priv_dev->id, net_dev, req)) return id_priv_dev; } } } return ERR_PTR(-EINVAL); } static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, struct cma_req_info *req, struct net_device **net_dev) { struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; } else { return ERR_CAST(*net_dev); } } mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice * validation, cm_id lookup under rcu lock. * RCU lock along with netdevice state check, synchronizes with * netdevice migrating to different net namespace and also avoids * case where net namespace doesn't get deleted while lookup is in * progress. * If the device state is not IFF_UP, its properties such as ifindex * and nd_net cannot be trusted to remain valid without rcu lock. * net/core/dev.c change_net_namespace() ensures to synchronize with * ongoing operations on net device after device is closed using * synchronize_net(). */ rcu_read_lock(); if (*net_dev) { /* * If netdevice is down, it is likely that it is administratively * down or it might be migrating to different namespace. * In that case avoid further processing, as the net namespace * or ifindex may change. */ if (((*net_dev)->flags & IFF_UP) == 0) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } if (!validate_net_dev(*net_dev, (struct sockaddr *)&req->src_addr_storage, (struct sockaddr *)&req->listen_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, rdma_ps_from_service_id(req->service_id), cma_port_from_service_id(req->service_id)); id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; } return id_priv; } static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv) { return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr); } static void cma_cancel_route(struct rdma_id_private *id_priv) { if (rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num)) { if (id_priv->query) ib_sa_cancel_query(id_priv->query_id, id_priv->query); } } static void _cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; lockdep_assert_held(&lock); /* * Remove from listen_any_list to prevent added devices from spawning * additional listen requests. */ list_del_init(&id_priv->listen_any_item); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_first_entry(&id_priv->listen_list, struct rdma_id_private, listen_item); /* sync with device removal to avoid duplicate destruction */ list_del_init(&dev_id_priv->device_item); list_del_init(&dev_id_priv->listen_item); mutex_unlock(&lock); rdma_destroy_id(&dev_id_priv->id); mutex_lock(&lock); } } static void cma_cancel_listens(struct rdma_id_private *id_priv) { mutex_lock(&lock); _cma_cancel_listens(id_priv); mutex_unlock(&lock); } static void cma_cancel_operation(struct rdma_id_private *id_priv, enum rdma_cm_state state) { switch (state) { case RDMA_CM_ADDR_QUERY: /* * We can avoid doing the rdma_addr_cancel() based on state, * only RDMA_CM_ADDR_QUERY has a work that could still execute. * Notice that the addr_handler work could still be exiting * outside this state, however due to the interaction with the * handler_mutex the work is guaranteed not to touch id_priv * during exit. */ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr); break; case RDMA_CM_ROUTE_QUERY: cma_cancel_route(id_priv); break; case RDMA_CM_LISTEN: if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev) cma_cancel_listens(id_priv); break; default: break; } } static void cma_release_port(struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list = id_priv->bind_list; struct net *net = id_priv->id.route.addr.dev_addr.net; if (!bind_list) return; mutex_lock(&lock); hlist_del(&id_priv->node); if (hlist_empty(&bind_list->owners)) { cma_ps_remove(net, bind_list->ps, bind_list->port); kfree(bind_list); } mutex_unlock(&lock); } static void destroy_mc(struct rdma_id_private *id_priv, struct cma_multicast *mc) { bool send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num)) ib_sa_free_multicast(mc->sa_mc); if (rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num)) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev && !send_only) { enum ib_gid_type gid_type; union ib_gid mgid; gid_type = id_priv->cma_dev->default_gid_type [id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid, gid_type); cma_igmp_send(ndev, &mgid, false); } dev_put(ndev); cancel_work_sync(&mc->iboe_join.work); } kfree(mc); } static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; while (!list_empty(&id_priv->mc_list)) { mc = list_first_entry(&id_priv->mc_list, struct cma_multicast, list); list_del(&mc->list); destroy_mc(id_priv, mc); } } static void _destroy_id(struct rdma_id_private *id_priv, enum rdma_cm_state state) { cma_cancel_operation(id_priv, state); rdma_restrack_del(&id_priv->res); cma_remove_id_from_tree(id_priv); if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.iw) iw_destroy_cm_id(id_priv->cm_id.iw); } cma_leave_mc_groups(id_priv); cma_release_dev(id_priv); } cma_release_port(id_priv); cma_id_put(id_priv); wait_for_completion(&id_priv->comp); if (id_priv->internal_id) cma_id_put(id_priv->id.context); kfree(id_priv->id.route.path_rec); kfree(id_priv->id.route.path_rec_inbound); kfree(id_priv->id.route.path_rec_outbound); put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } /* * destroy an ID from within the handler_mutex. This ensures that no other * handlers can start running concurrently. */ static void destroy_id_handler_unlock(struct rdma_id_private *id_priv) __releases(&idprv->handler_mutex) { enum rdma_cm_state state; unsigned long flags; trace_cm_id_destroy(id_priv); /* * Setting the state to destroyed under the handler mutex provides a * fence against calling handler callbacks. If this is invoked due to * the failure of a handler callback then it guarentees that no future * handlers will be called. */ lockdep_assert_held(&id_priv->handler_mutex); spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; id_priv->state = RDMA_CM_DESTROYING; spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); _destroy_id(id_priv, state); } void rdma_destroy_id(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); destroy_id_handler_unlock(id_priv); } EXPORT_SYMBOL(rdma_destroy_id); static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; ret = cma_modify_qp_rtr(id_priv, NULL); if (ret) goto reject; ret = cma_modify_qp_rts(id_priv, NULL); if (ret) goto reject; trace_cm_send_rtu(id_priv); ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0); if (ret) goto reject; return 0; reject: pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", ret); cma_modify_qp_err(id_priv); trace_cm_send_rej(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; } static void cma_set_rep_event_data(struct rdma_cm_event *event, const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE; event->param.conn.responder_resources = rep_data->responder_resources; event->param.conn.initiator_depth = rep_data->initiator_depth; event->param.conn.flow_control = rep_data->flow_control; event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; event->ece.vendor_id = rep_data->ece.vendor_id; event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, struct rdma_cm_event *event) { int ret; lockdep_assert_held(&id_priv->handler_mutex); trace_cm_event_handler(id_priv, event); ret = id_priv->id.event_handler(&id_priv->id, event); trace_cm_event_done(id_priv, event, ret); return ret; } static int cma_ib_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret; mutex_lock(&id_priv->handler_mutex); state = READ_ONCE(id_priv->state); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && state != RDMA_CM_DISCONNECT)) goto out; switch (ib_event->event) { case IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_REP_RECEIVED: if (state == RDMA_CM_CONNECT && (id_priv->id.qp_type != IB_QPT_UD)) { trace_cm_send_mra(id_priv); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } if (id_priv->id.qp) { event.status = cma_rep_recv(id_priv); event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR : RDMA_CM_EVENT_ESTABLISHED; } else { event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; } cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd, ib_event->private_data); break; case IB_CM_RTU_RECEIVED: case IB_CM_USER_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; break; case IB_CM_DREQ_ERROR: event.status = -ETIMEDOUT; fallthrough; case IB_CM_DREQ_RECEIVED: case IB_CM_DREP_RECEIVED: if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_DISCONNECT)) goto out; event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IB_CM_TIMEWAIT_EXIT: event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT; break; case IB_CM_MRA_RECEIVED: /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id, ib_event->param.rej_rcvd.reason)); cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static struct rdma_id_private * cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; struct rdma_route *rt; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path; const __be64 service_id = ib_event->param.req_rcvd.primary_path->service_id; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net, listen_id->event_handler, listen_id->context, listen_id->ps, ib_event->param.req_rcvd.qp_type, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, service_id)) goto err; rt = &id->route; rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1; rt->path_rec = kmalloc_array(rt->num_pri_alt_paths, sizeof(*rt->path_rec), GFP_KERNEL); if (!rt->path_rec) goto err; rt->path_rec[0] = *path; if (rt->num_pri_alt_paths == 2) rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND; rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid); ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey)); } else if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr); if (ret) goto err; } } rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid); id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static struct rdma_id_private * cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, const struct ib_cm_event *ib_event, struct net_device *net_dev) { const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; struct net *net = listen_id->route.addr.dev_addr.net; int ret; listen_id_priv = container_of(listen_id, struct rdma_id_private, id); id_priv = __rdma_create_id(net, listen_id->event_handler, listen_id->context, listen_id->ps, IB_QPT_UD, listen_id_priv); if (IS_ERR(id_priv)) return NULL; id = &id_priv->id; if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr, (struct sockaddr *)&id->route.addr.dst_addr, listen_id, ib_event, ss_family, ib_event->param.sidr_req_rcvd.service_id)) goto err; if (net_dev) { rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr); if (ret) goto err; } } id_priv->state = RDMA_CM_CONNECT; return id_priv; err: rdma_destroy_id(id); return NULL; } static void cma_set_req_event_data(struct rdma_cm_event *event, const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset; event->param.conn.responder_resources = req_data->responder_resources; event->param.conn.initiator_depth = req_data->initiator_depth; event->param.conn.flow_control = req_data->flow_control; event->param.conn.retry_count = req_data->retry_count; event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; event->ece.vendor_id = req_data->ece.vendor_id; event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) && (id->qp_type == IB_QPT_UD)) || (!id->qp_type)); } static int cma_ib_req_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); trace_cm_req_handler(listen_id, ib_event->event); if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) { ret = -ECONNABORTED; goto err_unlock; } offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } if (!conn_id) { ret = -ENOMEM; goto err_unlock; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) { destroy_id_handler_unlock(conn_id); goto err_unlock; } conn_id->cm_id.ib = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_ib_handler; ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ conn_id->cm_id.ib = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); goto net_dev_put; } if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT && conn_id->id.qp_type != IB_QPT_UD) { trace_cm_send_mra(cm_id->context); ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0); } mutex_unlock(&conn_id->handler_mutex); err_unlock: mutex_unlock(&listen_id->handler_mutex); net_dev_put: dev_put(net_dev); return ret; } __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) { if (addr->sa_family == AF_IB) return ((struct sockaddr_ib *) addr)->sib_sid; return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr))); } EXPORT_SYMBOL(rdma_get_service_id); void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid) { struct rdma_addr *addr = &cm_id->route.addr; if (!cm_id->device) { if (sgid) memset(sgid, 0, sizeof(*sgid)); if (dgid) memset(dgid, 0, sizeof(*dgid)); return; } if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { if (sgid) rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); if (dgid) rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); } else { if (sgid) rdma_addr_get_sgid(&addr->dev_addr, sgid); if (dgid) rdma_addr_get_dgid(&addr->dev_addr, dgid); } } EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; struct rdma_cm_event event = {}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; break; case IW_CM_EVENT_CONNECT_REPLY: memcpy(cma_src_addr(id_priv), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(id_priv), raddr, rdma_addr_size(raddr)); switch (iw_event->status) { case 0: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; case -ECONNRESET: case -ECONNREFUSED: event.event = RDMA_CM_EVENT_REJECTED; break; case -ETIMEDOUT: event.event = RDMA_CM_EVENT_UNREACHABLE; break; default: event.event = RDMA_CM_EVENT_CONNECT_ERROR; break; } break; case IW_CM_EVENT_ESTABLISHED: event.event = RDMA_CM_EVENT_ESTABLISHED; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; break; default: goto out; } event.status = iw_event->status; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; ret = cma_cm_event_handler(id_priv, &event); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.iw = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return ret; } static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct iw_cm_event *iw_event) { struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; event.param.conn.private_data_len = iw_event->private_data_len; event.param.conn.initiator_depth = iw_event->ird; event.param.conn.responder_resources = iw_event->ord; listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) goto out; /* Create a new RDMA id for the new IW CM ID */ conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net, listen_id->id.event_handler, listen_id->id.context, RDMA_PS_TCP, IB_QPT_RC, listen_id); if (IS_ERR(conn_id)) { ret = -ENOMEM; goto out; } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } conn_id->cm_id.iw = cm_id; cm_id->context = conn_id; cm_id->cm_handler = cma_iw_handler; memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); ret = cma_cm_event_handler(conn_id, &event); if (ret) { /* User wants to destroy the CM ID */ conn_id->cm_id.iw = NULL; mutex_unlock(&listen_id->handler_mutex); destroy_id_handler_unlock(conn_id); return ret; } mutex_unlock(&conn_id->handler_mutex); out: mutex_unlock(&listen_id->handler_mutex); return ret; } static int cma_ib_listen(struct rdma_id_private *id_priv) { struct sockaddr *addr; struct ib_cm_id *id; __be64 svc_id; addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); id = ib_cm_insert_listen(id_priv->id.device, cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; return 0; } static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog) { int ret; struct iw_cm_id *id; id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler, id_priv); if (IS_ERR(id)) return PTR_ERR(id); mutex_lock(&id_priv->qp_mutex); id->tos = id_priv->tos; id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id->afonly = id_priv->afonly; id_priv->cm_id.iw = id; memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); ret = iw_cm_listen(id_priv->cm_id.iw, backlog); if (ret) { iw_destroy_cm_id(id_priv->cm_id.iw); id_priv->cm_id.iw = NULL; } return ret; } static int cma_listen_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct rdma_id_private *id_priv = id->context; /* Listening IDs are always destroyed on removal */ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) return -1; id->context = id_priv->id.context; id->event_handler = id_priv->id.event_handler; trace_cm_event_handler(id_priv, event); return id_priv->id.event_handler(id, event); } static int cma_listen_on_dev(struct rdma_id_private *id_priv, struct cma_device *cma_dev, struct rdma_id_private **to_destroy) { struct rdma_id_private *dev_id_priv; struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; lockdep_assert_held(&lock); *to_destroy = NULL; if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return 0; dev_id_priv = __rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps, id_priv->id.qp_type, id_priv); if (IS_ERR(dev_id_priv)) return PTR_ERR(dev_id_priv); dev_id_priv->state = RDMA_CM_ADDR_BOUND; memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); _cma_attach_to_dev(dev_id_priv, cma_dev); rdma_restrack_add(&dev_id_priv->res); cma_id_get(id_priv); dev_id_priv->internal_id = 1; dev_id_priv->afonly = id_priv->afonly; mutex_lock(&id_priv->qp_mutex); dev_id_priv->tos_set = id_priv->tos_set; dev_id_priv->tos = id_priv->tos; mutex_unlock(&id_priv->qp_mutex); ret = rdma_listen(&dev_id_priv->id, id_priv->backlog); if (ret) goto err_listen; list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list); return 0; err_listen: /* Caller must destroy this after releasing lock */ *to_destroy = dev_id_priv; dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret); return ret; } static int cma_listen_on_all(struct rdma_id_private *id_priv) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; int ret; mutex_lock(&lock); list_add_tail(&id_priv->listen_any_item, &listen_any_list); list_for_each_entry(cma_dev, &dev_list, list) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) { /* Prevent racing with cma_process_remove() */ if (to_destroy) list_del_init(&to_destroy->device_item); goto err_listen; } } mutex_unlock(&lock); return 0; err_listen: _cma_cancel_listens(id_priv); mutex_unlock(&lock); if (to_destroy) rdma_destroy_id(&to_destroy->id); return ret; } void rdma_set_service_type(struct rdma_cm_id *id, int tos) { struct rdma_id_private *id_priv; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->tos = (u8) tos; id_priv->tos_set = true; mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_set_service_type); /** * rdma_set_ack_timeout() - Set the ack timeout of QP associated * with a connection identifier. * @id: Communication identifier to associated with service type. * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec. * * This function should be called before rdma_connect() on active side, * and on passive side before rdma_accept(). It is applicable to primary * path only. The timeout will affect the local side of the QP, it is not * negotiated with remote side and zero disables the timer. In case it is * set before rdma_resolve_route, the value will also be used to determine * PacketLifeTime for RoCE. * * Return: 0 for success */ int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout) { struct rdma_id_private *id_priv; if (id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_INI) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->timeout = timeout; id_priv->timeout_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_ack_timeout); /** * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the * QP associated with a connection identifier. * @id: Communication identifier to associated with service type. * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK * Timer Field" in the IBTA specification. * * This function should be called before rdma_connect() on active * side, and on passive side before rdma_accept(). The timer value * will be associated with the local QP. When it receives a send it is * not read to handle, typically if the receive queue is empty, an RNR * Retry NAK is returned to the requester with the min_rnr_timer * encoded. The requester will then wait at least the time specified * in the NAK before retrying. The default is zero, which translates * to a minimum RNR Timer value of 655 ms. * * Return: 0 for success */ int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer) { struct rdma_id_private *id_priv; /* It is a five-bit value */ if (min_rnr_timer & 0xe0) return -EINVAL; if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT)) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->qp_mutex); id_priv->min_rnr_timer = min_rnr_timer; id_priv->min_rnr_timer_set = true; mutex_unlock(&id_priv->qp_mutex); return 0; } EXPORT_SYMBOL(rdma_set_min_rnr_timer); static int route_set_path_rec_inbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_inbound) { route->path_rec_inbound = kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL); if (!route->path_rec_inbound) return -ENOMEM; } *route->path_rec_inbound = *path_rec; return 0; } static int route_set_path_rec_outbound(struct cma_work *work, struct sa_path_rec *path_rec) { struct rdma_route *route = &work->id->id.route; if (!route->path_rec_outbound) { route->path_rec_outbound = kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL); if (!route->path_rec_outbound) return -ENOMEM; } *route->path_rec_outbound = *path_rec; return 0; } static void cma_query_handler(int status, struct sa_path_rec *path_rec, unsigned int num_prs, void *context) { struct cma_work *work = context; struct rdma_route *route; int i; route = &work->id->id.route; if (status) goto fail; for (i = 0; i < num_prs; i++) { if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP)) *route->path_rec = path_rec[i]; else if (path_rec[i].flags & IB_PATH_INBOUND) status = route_set_path_rec_inbound(work, &path_rec[i]); else if (path_rec[i].flags & IB_PATH_OUTBOUND) status = route_set_path_rec_outbound(work, &path_rec[i]); else status = -EINVAL; if (status) goto fail; } route->num_pri_alt_paths = 1; queue_work(cma_wq, &work->work); return; fail: work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_ERROR; work->event.status = status; pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n", status); queue_work(cma_wq, &work->work); } static int cma_query_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; ib_sa_comp_mask comp_mask; struct sockaddr_in6 *sin6; struct sockaddr_ib *sib; memset(&path_rec, 0, sizeof path_rec); if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num)) path_rec.rec_type = SA_PATH_REC_TYPE_OPA; else path_rec.rec_type = SA_PATH_REC_TYPE_IB; rdma_addr_get_sgid(dev_addr, &path_rec.sgid); rdma_addr_get_dgid(dev_addr, &path_rec.dgid); path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); path_rec.numb_path = 1; path_rec.reversible = 1; path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID; switch (cma_family(id_priv)) { case AF_INET: path_rec.qos_class = cpu_to_be16((u16) id_priv->tos); comp_mask |= IB_SA_PATH_REC_QOS_CLASS; break; case AF_INET6: sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; case AF_IB: sib = (struct sockaddr_ib *) cma_src_addr(id_priv); path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20); comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS; break; } id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device, id_priv->id.port_num, &path_rec, comp_mask, timeout_ms, GFP_KERNEL, cma_query_handler, work, &id_priv->query); return (id_priv->query_id < 0) ? id_priv->query_id : 0; } static void cma_iboe_join_work_handler(struct work_struct *work) { struct cma_multicast *mc = container_of(work, struct cma_multicast, iboe_join.work); struct rdma_cm_event *event = &mc->iboe_join.event; struct rdma_id_private *id_priv = mc->id_priv; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; ret = cma_cm_event_handler(id_priv, event); WARN_ON(ret); out_unlock: mutex_unlock(&id_priv->handler_mutex); if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&event->param.ud.ah_attr); } static void cma_work_handler(struct work_struct *_work) { struct cma_work *work = container_of(_work, struct cma_work, work); struct rdma_id_private *id_priv = work->id; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; if (work->old_state != 0 || work->new_state != 0) { if (!cma_comp_exch(id_priv, work->old_state, work->new_state)) goto out_unlock; } if (cma_cm_event_handler(id_priv, &work->event)) { cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); goto out_free; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); out_free: if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN) rdma_destroy_ah_attr(&work->event.param.ud.ah_attr); kfree(work); } static void cma_init_resolve_route_work(struct cma_work *work, struct rdma_id_private *id_priv) { work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ROUTE_QUERY; work->new_state = RDMA_CM_ROUTE_RESOLVED; work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; } static void enqueue_resolve_addr_work(struct cma_work *work, struct rdma_id_private *id_priv) { /* Balances with cma_id_put() in cma_work_handler */ cma_id_get(id_priv); work->id = id_priv; INIT_WORK(&work->work, cma_work_handler); work->old_state = RDMA_CM_ADDR_QUERY; work->new_state = RDMA_CM_ADDR_RESOLVED; work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; queue_work(cma_wq, &work->work); } static int cma_resolve_ib_route(struct rdma_id_private *id_priv, unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); if (!route->path_rec) route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } ret = cma_query_ib_route(id_priv, timeout_ms, work); if (ret) goto err2; return 0; err2: kfree(route->path_rec); route->path_rec = NULL; err1: kfree(work); return ret; } static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, unsigned long supported_gids, enum ib_gid_type default_gid) { if ((network_type == RDMA_NETWORK_IPV4 || network_type == RDMA_NETWORK_IPV6) && test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) return IB_GID_TYPE_ROCE_UDP_ENCAP; return default_gid; } /* * cma_iboe_set_path_rec_l2_fields() is helper function which sets * path record type based on GID type. * It also sets up other L2 fields which includes destination mac address * netdev ifindex, of the path record. * It returns the netdev of the bound interface for this path record entry. */ static struct net_device * cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; struct rdma_addr *addr = &route->addr; unsigned long supported_gids; struct net_device *ndev; if (!addr->dev_addr.bound_dev_if) return NULL; ndev = dev_get_by_index(addr->dev_addr.net, addr->dev_addr.bound_dev_if); if (!ndev) return NULL; supported_gids = roce_gid_type_mask_support(id_priv->id.device, id_priv->id.port_num); gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); /* Use the hint from IP Stack to select GID Type */ if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } if (rdma_protocol_roce(id->device, id->port_num)) { ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err_free; } dev_put(ndev); } id->route.num_pri_alt_paths = 1; return 0; err_free: kfree(id->route.path_rec); id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio) { struct net_device *dev; dev = vlan_dev_real_dev(vlan_ndev); if (dev->num_tc) return netdev_get_prio_tc_map(dev, prio); return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } struct iboe_prio_tc_map { int input_prio; int output_tc; bool found; }; static int get_lower_vlan_dev_tc(struct net_device *dev, struct netdev_nested_priv *priv) { struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data; if (is_vlan_dev(dev)) map->output_tc = get_vlan_ndev_tc(dev, map->input_prio); else if (dev->num_tc) map->output_tc = netdev_get_prio_tc_map(dev, map->input_prio); else map->output_tc = 0; /* We are interested only in first level VLAN device, so always * return 1 to stop iterating over next level devices. */ map->found = true; return 1; } static int iboe_tos_to_sl(struct net_device *ndev, int tos) { struct iboe_prio_tc_map prio_tc_map = {}; int prio = rt_tos2priority(tos); struct netdev_nested_priv priv; /* If VLAN device, get it directly from the VLAN netdev */ if (is_vlan_dev(ndev)) return get_vlan_ndev_tc(ndev, prio); prio_tc_map.input_prio = prio; priv.data = (void *)&prio_tc_map; rcu_read_lock(); netdev_walk_all_lower_dev_rcu(ndev, get_lower_vlan_dev_tc, &priv); rcu_read_unlock(); /* If map is found from lower device, use it; Otherwise * continue with the current netdevice to get priority to tc map. */ if (prio_tc_map.found) return prio_tc_map.output_tc; else if (ndev->num_tc) return netdev_get_prio_tc_map(ndev, prio); else return 0; } static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) { struct sockaddr_in6 *addr6; u16 dport, sport; u32 hash, fl; addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; if ((cma_family(id_priv) != AF_INET6) || !fl) { dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); hash = (u32)sport * 31 + dport; fl = hash & IB_GRH_FLOWLABEL_MASK; } return cpu_to_be32(fl); } static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; struct net_device *ndev; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos; mutex_lock(&id_priv->qp_mutex); tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; mutex_unlock(&id_priv->qp_mutex); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; goto err1; } route->num_pri_alt_paths = 1; ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; else route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; route->path_rec->sl = iboe_tos_to_sl(ndev, tos); route->path_rec->traffic_class = tos; route->path_rec->mtu = iboe_get_mtu(ndev->mtu); route->path_rec->rate_selector = IB_SA_EQ; route->path_rec->rate = IB_RATE_PORT_CURRENT; dev_put(ndev); route->path_rec->packet_life_time_selector = IB_SA_EQ; /* In case ACK timeout is set, use this value to calculate * PacketLifeTime. As per IBTA 12.7.34, * local ACK timeout = (2 * PacketLifeTime + Local CA’s ACK delay). * Assuming a negligible local ACK delay, we can use * PacketLifeTime = local ACK timeout/2 * as a reasonable approximation for RoCE networks. */ mutex_lock(&id_priv->qp_mutex); if (id_priv->timeout_set && id_priv->timeout) route->path_rec->packet_life_time = id_priv->timeout - 1; else route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME; mutex_unlock(&id_priv->qp_mutex); if (!route->path_rec->mtu) { ret = -EINVAL; goto err2; } if (rdma_protocol_roce_udp_encap(id_priv->id.device, id_priv->id.port_num)) route->path_rec->flow_label = cma_get_roce_udp_flow_label(id_priv); cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err2: kfree(route->path_rec); route->path_rec = NULL; route->num_pri_alt_paths = 0; err1: kfree(work); return ret; } int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; if (!timeout_ms) return -EINVAL; id_priv = container_of(id, struct rdma_id_private, id); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY)) return -EINVAL; cma_id_get(id_priv); if (rdma_cap_ib_sa(id->device, id->port_num)) ret = cma_resolve_ib_route(id_priv, timeout_ms); else if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_resolve_iboe_route(id_priv); if (!ret) cma_add_id_to_tree(id_priv); } else if (rdma_protocol_iwarp(id->device, id->port_num)) ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED); cma_id_put(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_route); static void cma_set_loopback(struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK); break; case AF_INET6: ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr, 0, 0, 0, htonl(1)); break; default: ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr, 0, 0, 0, htonl(1)); break; } } static int cma_bind_loopback(struct rdma_id_private *id_priv) { struct cma_device *cma_dev, *cur_dev; union ib_gid gid; enum ib_port_state port_state; unsigned int p; u16 pkey; int ret; cma_dev = NULL; mutex_lock(&lock); list_for_each_entry(cur_dev, &dev_list, list) { if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cur_dev->device, 1)) continue; if (!cma_dev) cma_dev = cur_dev; rdma_for_each_port (cur_dev->device, p) { if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) && port_state == IB_PORT_ACTIVE) { cma_dev = cur_dev; goto port_found; } } } if (!cma_dev) { ret = -ENODEV; goto out; } p = 1; port_found: ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey); if (ret) goto out; id_priv->id.route.addr.dev_addr.dev_type = (rdma_protocol_ib(cma_dev->device, p)) ? ARPHRD_INFINIBAND : ARPHRD_ETHER; rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid); ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey); id_priv->id.port_num = p; cma_attach_to_dev(id_priv, cma_dev); rdma_restrack_add(&id_priv->res); cma_set_loopback(cma_src_addr(id_priv)); out: mutex_unlock(&lock); return ret; } static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; struct rdma_cm_event event = {}; struct sockaddr *addr; struct sockaddr_storage old_addr; mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) goto out; /* * Store the previous src address, so that if we fail to acquire * matching rdma device, old address can be restored back, which helps * to cancel the cma listen operation correctly. */ addr = cma_src_addr(id_priv); memcpy(&old_addr, addr, rdma_addr_size(addr)); memcpy(addr, src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); rdma_restrack_add(&id_priv->res); } else if (status) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status); } if (status) { memcpy(addr, &old_addr, rdma_addr_size((struct sockaddr *)&old_addr)); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ADDR_BOUND)) goto out; event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = status; } else event.event = RDMA_CM_EVENT_ADDR_RESOLVED; if (cma_cm_event_handler(id_priv, &event)) { destroy_id_handler_unlock(id_priv); return; } out: mutex_unlock(&id_priv->handler_mutex); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) { struct cma_work *work; union ib_gid gid; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_bind_loopback(id_priv); if (ret) goto err; } rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) { struct cma_work *work; int ret; work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; if (!id_priv->cma_dev) { ret = cma_resolve_ib_dev(id_priv); if (ret) goto err; } rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); enqueue_resolve_addr_work(work, id_priv); return 0; err: kfree(work); return ret; } int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if ((reuse && id_priv->state != RDMA_CM_LISTEN) || id_priv->state == RDMA_CM_IDLE) { id_priv->reuseaddr = reuse; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_reuseaddr); int rdma_set_afonly(struct rdma_cm_id *id, int afonly) { struct rdma_id_private *id_priv; unsigned long flags; int ret; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irqsave(&id_priv->lock, flags); if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) { id_priv->options |= (1 << CMA_OPTION_AFONLY); id_priv->afonly = afonly; ret = 0; } else { ret = -EINVAL; } spin_unlock_irqrestore(&id_priv->lock, flags); return ret; } EXPORT_SYMBOL(rdma_set_afonly); static void cma_bind_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct sockaddr *addr; struct sockaddr_ib *sib; u64 sid, mask; __be16 port; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); port = htons(bind_list->port); switch (addr->sa_family) { case AF_INET: ((struct sockaddr_in *) addr)->sin_port = port; break; case AF_INET6: ((struct sockaddr_in6 *) addr)->sin6_port = port; break; case AF_IB: sib = (struct sockaddr_ib *) addr; sid = be64_to_cpu(sib->sib_sid); mask = be64_to_cpu(sib->sib_sid_mask); sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port)); sib->sib_sid_mask = cpu_to_be64(~0ULL); break; } id_priv->bind_list = bind_list; hlist_add_head(&id_priv->node, &bind_list->owners); } static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv, unsigned short snum) { struct rdma_bind_list *bind_list; int ret; lockdep_assert_held(&lock); bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list, snum); if (ret < 0) goto err; bind_list->ps = ps; bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: kfree(bind_list); return ret == -ENOSPC ? -EADDRNOTAVAIL : ret; } static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv) { struct rdma_id_private *cur_id; struct sockaddr *daddr = cma_dst_addr(id_priv); struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); lockdep_assert_held(&lock); hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); __be16 cur_dport = cma_port(cur_daddr); if (id_priv == cur_id) continue; /* different dest port -> unique */ if (!cma_any_port(daddr) && !cma_any_port(cur_daddr) && (dport != cur_dport)) continue; /* different src address -> unique */ if (!cma_any_addr(saddr) && !cma_any_addr(cur_saddr) && cma_addr_cmp(saddr, cur_saddr)) continue; /* different dst address -> unique */ if (!cma_any_addr(daddr) && !cma_any_addr(cur_daddr) && cma_addr_cmp(daddr, cur_daddr)) continue; return -EADDRNOTAVAIL; } return 0; } static int cma_alloc_any_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { static unsigned int last_used_port; int low, high, remaining; unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; lockdep_assert_held(&lock); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = get_random_u32_inclusive(low, remaining + low - 1); retry: if (last_used_port != rover) { struct rdma_bind_list *bind_list; int ret; bind_list = cma_ps_find(net, ps, (unsigned short)rover); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, rover); } else { ret = cma_port_is_unique(bind_list, id_priv); if (!ret) cma_bind_port(bind_list, id_priv); } /* * Remember previously used port number in order to avoid * re-using same port immediately after it is closed. */ if (!ret) last_used_port = rover; if (ret != -EADDRNOTAVAIL) return ret; } if (--remaining) { rover++; if ((rover < low) || (rover > high)) rover = low; goto retry; } return -EADDRNOTAVAIL; } /* * Check that the requested port is available. This is called when trying to * bind to a specific port, or when trying to listen on a bound port. In * the latter case, the provided id_priv may already be on the bind_list, but * we still need to check that it's okay to start listening. */ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *id_priv, uint8_t reuseaddr) { struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; lockdep_assert_held(&lock); addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) continue; if (reuseaddr && cur_id->reuseaddr) continue; cur_addr = cma_src_addr(cur_id); if (id_priv->afonly && cur_id->afonly && (addr->sa_family != cur_addr->sa_family)) continue; if (cma_any_addr(addr) || cma_any_addr(cur_addr)) return -EADDRNOTAVAIL; if (!cma_addr_cmp(addr, cur_addr)) return -EADDRINUSE; } return 0; } static int cma_use_port(enum rdma_ucm_port_space ps, struct rdma_id_private *id_priv) { struct rdma_bind_list *bind_list; unsigned short snum; int ret; lockdep_assert_held(&lock); snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum); if (!bind_list) { ret = cma_alloc_port(ps, id_priv, snum); } else { ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr); if (!ret) cma_bind_port(bind_list, id_priv); } return ret; } static enum rdma_ucm_port_space cma_select_inet_ps(struct rdma_id_private *id_priv) { switch (id_priv->id.ps) { case RDMA_PS_TCP: case RDMA_PS_UDP: case RDMA_PS_IPOIB: case RDMA_PS_IB: return id_priv->id.ps; default: return 0; } } static enum rdma_ucm_port_space cma_select_ib_ps(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps = 0; struct sockaddr_ib *sib; u64 sid_ps, mask, sid; sib = (struct sockaddr_ib *) cma_src_addr(id_priv); mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK; sid = be64_to_cpu(sib->sib_sid) & mask; if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) { sid_ps = RDMA_IB_IP_PS_IB; ps = RDMA_PS_IB; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) && (sid == (RDMA_IB_IP_PS_TCP & mask))) { sid_ps = RDMA_IB_IP_PS_TCP; ps = RDMA_PS_TCP; } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) && (sid == (RDMA_IB_IP_PS_UDP & mask))) { sid_ps = RDMA_IB_IP_PS_UDP; ps = RDMA_PS_UDP; } if (ps) { sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib))); sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK | be64_to_cpu(sib->sib_sid_mask)); } return ps; } static int cma_get_port(struct rdma_id_private *id_priv) { enum rdma_ucm_port_space ps; int ret; if (cma_family(id_priv) != AF_IB) ps = cma_select_inet_ps(id_priv); else ps = cma_select_ib_ps(id_priv); if (!ps) return -EPROTONOSUPPORT; mutex_lock(&lock); if (cma_any_port(cma_src_addr(id_priv))) ret = cma_alloc_any_port(ps, id_priv); else ret = cma_use_port(ps, id_priv); mutex_unlock(&lock); return ret; } static int cma_check_linklocal(struct rdma_dev_addr *dev_addr, struct sockaddr *addr) { #if IS_ENABLED(CONFIG_IPV6) struct sockaddr_in6 *sin6; if (addr->sa_family != AF_INET6) return 0; sin6 = (struct sockaddr_in6 *) addr; if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) return 0; if (!sin6->sin6_scope_id) return -EINVAL; dev_addr->bound_dev_if = sin6->sin6_scope_id; #endif return 0; } int rdma_listen(struct rdma_cm_id *id, int backlog) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) { struct sockaddr_in any_in = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = rdma_bind_addr(id, (struct sockaddr *)&any_in); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))) return -EINVAL; } /* * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable * any more, and has to be unique in the bind list. */ if (id_priv->reuseaddr) { mutex_lock(&lock); ret = cma_check_port(id_priv->bind_list, id_priv, 0); if (!ret) id_priv->reuseaddr = 0; mutex_unlock(&lock); if (ret) goto err; } id_priv->backlog = backlog; if (id_priv->cma_dev) { if (rdma_cap_ib_cm(id->device, 1)) { ret = cma_ib_listen(id_priv); if (ret) goto err; } else if (rdma_cap_iw_cm(id->device, 1)) { ret = cma_iw_listen(id_priv, backlog); if (ret) goto err; } else { ret = -ENOSYS; goto err; } } else { ret = cma_listen_on_all(id_priv); if (ret) goto err; } return 0; err: id_priv->backlog = 0; /* * All the failure paths that lead here will not allow the req_handler's * to have run. */ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_listen); static int rdma_bind_addr_dst(struct rdma_id_private *id_priv, struct sockaddr *addr, const struct sockaddr *daddr) { struct sockaddr *id_daddr; int ret; if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 && addr->sa_family != AF_IB) return -EAFNOSUPPORT; if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND)) return -EINVAL; ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr); if (ret) goto err1; memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr)); if (!cma_any_addr(addr)) { ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr); if (ret) goto err1; ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) { if (addr->sa_family == AF_INET) id_priv->afonly = 1; #if IS_ENABLED(CONFIG_IPV6) else if (addr->sa_family == AF_INET6) { struct net *net = id_priv->id.route.addr.dev_addr.net; id_priv->afonly = net->ipv6.sysctl.bindv6only; } #endif } id_daddr = cma_dst_addr(id_priv); if (daddr != id_daddr) memcpy(id_daddr, daddr, rdma_addr_size(addr)); id_daddr->sa_family = addr->sa_family; ret = cma_get_port(id_priv); if (ret) goto err2; if (!cma_any_addr(addr)) rdma_restrack_add(&id_priv->res); return 0; err2: if (id_priv->cma_dev) cma_release_dev(id_priv); err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct sockaddr_storage zero_sock = {}; if (src_addr && src_addr->sa_family) return rdma_bind_addr_dst(id_priv, src_addr, dst_addr); /* * When the src_addr is not specified, automatically supply an any addr */ zero_sock.ss_family = dst_addr->sa_family; if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) { struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)&zero_sock; struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst_addr; src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id; if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL) id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id; } else if (dst_addr->sa_family == AF_IB) { ((struct sockaddr_ib *)&zero_sock)->sib_pkey = ((struct sockaddr_ib *)dst_addr)->sib_pkey; } return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr); } /* * If required, resolve the source address for bind and leave the id_priv in * state RDMA_CM_ADDR_BOUND. This oddly uses the state to determine the prior * calls made by ULP, a previously bound ID will not be re-bound and src_addr is * ignored. */ static int resolve_prepare_src(struct rdma_id_private *id_priv, struct sockaddr *src_addr, const struct sockaddr *dst_addr) { int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { /* For a well behaved ULP state will be RDMA_CM_IDLE */ ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr); if (ret) return ret; if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))) return -EINVAL; } else { memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); } if (cma_family(id_priv) != dst_addr->sa_family) { ret = -EINVAL; goto err_state; } return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; ret = resolve_prepare_src(id_priv, src_addr, dst_addr); if (ret) return ret; if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { /* * The FSM can return back to RDMA_CM_ADDR_BOUND after * rdma_resolve_ip() is called, eg through the error * path in addr_handler(). If this happens the existing * request must be canceled before issuing a new one. * Since canceling a request is a bit slow and this * oddball path is rare, keep track once a request has * been issued. The track turns out to be a permanent * state since this is the only cancel as it is * immediately before rdma_resolve_ip(). */ if (id_priv->used_resolve_ip) rdma_addr_cancel(&id->route.addr.dev_addr); else id_priv->used_resolve_ip = 1; ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, false, id_priv); } } if (ret) goto err; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv)); } EXPORT_SYMBOL(rdma_bind_addr); static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) { struct cma_hdr *cma_hdr; cma_hdr = hdr; cma_hdr->cma_version = CMA_VERSION; if (cma_family(id_priv) == AF_INET) { struct sockaddr_in *src4, *dst4; src4 = (struct sockaddr_in *) cma_src_addr(id_priv); dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 4); cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr; cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr; cma_hdr->port = src4->sin_port; } else if (cma_family(id_priv) == AF_INET6) { struct sockaddr_in6 *src6, *dst6; src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv); dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv); cma_set_ip_ver(cma_hdr, 6); cma_hdr->src_addr.ip6 = src6->sin6_addr; cma_hdr->dst_addr.ip6 = dst6->sin6_addr; cma_hdr->port = src6->sin6_port; } return 0; } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; struct rdma_cm_event event = {}; const struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) goto out; switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; break; case IB_CM_SIDR_REP_RECEIVED: event.param.ud.private_data = ib_event->private_data; event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; if (rep->status != IB_SIDR_SUCCESS) { event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = ib_event->param.sidr_rep_rcvd.status; pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n", event.status); break; } ret = cma_set_qkey(id_priv, rep->qkey); if (ret) { pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret); event.event = RDMA_CM_EVENT_ADDR_ERROR; event.status = ret; break; } ib_init_ah_attr_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, &event.param.ud.ah_attr, rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; event.status = 0; break; default: pr_err("RDMA CMA: unexpected IB CM event: %d\n", ib_event->event); goto out; } ret = cma_cm_event_handler(id_priv, &event); rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. */ id_priv->cm_id.ib = NULL; destroy_id_handler_unlock(id_priv); return ret; } out: mutex_unlock(&id_priv->handler_mutex); return 0; } static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_sidr_req_param req; struct ib_cm_id *id; void *private_data; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; trace_cm_send_sidr_req(id_priv); ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req); if (ret) { ib_destroy_cm_id(id_priv->cm_id.ib); id_priv->cm_id.ib = NULL; } out: kfree(private_data); return ret; } static int cma_connect_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_req_param req; struct rdma_route *route; void *private_data; struct ib_cm_id *id; u8 offset; int ret; memset(&req, 0, sizeof req); offset = cma_user_data_offset(id_priv); if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len)) return -EINVAL; if (req.private_data_len) { private_data = kzalloc(req.private_data_len, GFP_ATOMIC); if (!private_data) return -ENOMEM; } else { private_data = NULL; } if (conn_param->private_data && conn_param->private_data_len) memcpy(private_data + offset, conn_param->private_data, conn_param->private_data_len); id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv); if (IS_ERR(id)) { ret = PTR_ERR(id); goto out; } id_priv->cm_id.ib = id; route = &id_priv->id.route; if (private_data) { ret = cma_format_hdr(private_data, id_priv); if (ret) goto out; req.private_data = private_data; } req.primary_path = &route->path_rec[0]; req.primary_path_inbound = route->path_rec_inbound; req.primary_path_outbound = route->path_rec_outbound; if (route->num_pri_alt_paths == 2) req.alternate_path = &route->path_rec[1]; req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; req.starting_psn = id_priv->seq_num; req.responder_resources = conn_param->responder_resources; req.initiator_depth = conn_param->initiator_depth; req.flow_control = conn_param->flow_control; req.retry_count = min_t(u8, 7, conn_param->retry_count); req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; req.ece.vendor_id = id_priv->ece.vendor_id; req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); out: if (ret && !IS_ERR(id)) { ib_destroy_cm_id(id); id_priv->cm_id.ib = NULL; } kfree(private_data); return ret; } static int cma_connect_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_id *cm_id; int ret; struct iw_cm_conn_param iw_param; cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv); if (IS_ERR(cm_id)) return PTR_ERR(cm_id); mutex_lock(&id_priv->qp_mutex); cm_id->tos = id_priv->tos; cm_id->tos_set = id_priv->tos_set; mutex_unlock(&id_priv->qp_mutex); id_priv->cm_id.iw = cm_id; memcpy(&cm_id->local_addr, cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv), rdma_addr_size(cma_dst_addr(id_priv))); ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; if (conn_param) { iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num; } else { memset(&iw_param, 0, sizeof iw_param); iw_param.qpn = id_priv->qp_num; } ret = iw_cm_connect(cm_id, &iw_param); out: if (ret) { iw_destroy_cm_id(cm_id); id_priv->cm_id.iw = NULL; } return ret; } /** * rdma_connect_locked - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Same as rdma_connect() but can only be called from the * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback. */ int rdma_connect_locked(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT)) return -EINVAL; if (!id->qp) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) ret = cma_resolve_ib_udp(id_priv, conn_param); else ret = cma_connect_ib(id_priv, conn_param); } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_connect_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto err_state; return 0; err_state: cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED); return ret; } EXPORT_SYMBOL(rdma_connect_locked); /** * rdma_connect - Initiate an active connection request. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * * Users must have resolved a route for the rdma_cm_id to connect with by having * called rdma_resolve_route before calling this routine. * * This call will either connect to a remote QP or obtain remote QP information * for unconnected rdma_cm_id's. The actual operation is based on the * rdma_cm_id's port space. */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; mutex_lock(&id_priv->handler_mutex); ret = rdma_connect_locked(id, conn_param); mutex_unlock(&id_priv->handler_mutex); return ret; } EXPORT_SYMBOL(rdma_connect); /** * rdma_connect_ece - Initiate an active connection request with ECE data. * @id: Connection identifier to connect. * @conn_param: Connection information used for connected QPs. * @ece: ECE parameters * * See rdma_connect() explanation. */ int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_connect(id, conn_param); } EXPORT_SYMBOL(rdma_connect_ece); static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct ib_cm_rep_param rep; int ret; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) goto out; ret = cma_modify_qp_rts(id_priv, conn_param); if (ret) goto out; memset(&rep, 0, sizeof rep); rep.qp_num = id_priv->qp_num; rep.starting_psn = id_priv->seq_num; rep.private_data = conn_param->private_data; rep.private_data_len = conn_param->private_data_len; rep.responder_resources = conn_param->responder_resources; rep.initiator_depth = conn_param->initiator_depth; rep.failover_accepted = 0; rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); out: return ret; } static int cma_accept_iw(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { struct iw_cm_conn_param iw_param; int ret; if (!conn_param) return -EINVAL; ret = cma_modify_qp_rtr(id_priv, conn_param); if (ret) return ret; iw_param.ord = conn_param->initiator_depth; iw_param.ird = conn_param->responder_resources; iw_param.private_data = conn_param->private_data; iw_param.private_data_len = conn_param->private_data_len; if (id_priv->id.qp) iw_param.qpn = id_priv->qp_num; else iw_param.qpn = conn_param->qp_num; return iw_cm_accept(id_priv->cm_id.iw, &iw_param); } static int cma_send_sidr_rep(struct rdma_id_private *id_priv, enum ib_cm_sidr_status status, u32 qkey, const void *private_data, int private_data_len) { struct ib_cm_sidr_rep_param rep; int ret; memset(&rep, 0, sizeof rep); rep.status = status; if (status == IB_SIDR_SUCCESS) { if (qkey) ret = cma_set_qkey(id_priv, qkey); else ret = cma_set_default_qkey(id_priv); if (ret) return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; rep.ece.vendor_id = id_priv->ece.vendor_id; rep.ece.attr_mod = id_priv->ece.attr_mod; } rep.private_data = private_data; rep.private_data_len = private_data_len; trace_cm_send_sidr_rep(id_priv); return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep); } /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. * @conn_param: Information needed to establish the connection. This must be * provided if accepting a connection request. If accepting a connection * response, this parameter must be NULL. * * Typically, this routine is only called by the listener to accept a connection * request. It must also be called on the active side of a connection if the * user is performing their own QP transitions. * * In the case of error, a reject message is sent to the remote side and the * state of the qp associated with the id is modified to error, such that any * previously posted receive buffers would be flushed. * * This function is for use by kernel ULPs and must be called from under the * handler callback. */ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); int ret; lockdep_assert_held(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT) return -EINVAL; if (!id->qp && conn_param) { id_priv->qp_num = conn_param->qp_num; id_priv->srq = conn_param->srq; } if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { if (conn_param) ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, conn_param->qkey, conn_param->private_data, conn_param->private_data_len); else ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS, 0, NULL, 0); } else { if (conn_param) ret = cma_accept_ib(id_priv, conn_param); else ret = cma_rep_recv(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = cma_accept_iw(id_priv, conn_param); } else { ret = -ENOSYS; } if (ret) goto reject; return 0; reject: cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(rdma_accept); int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, struct rdma_ucm_ece *ece) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); id_priv->ece.vendor_id = ece->vendor_id; id_priv->ece.attr_mod = ece->attr_mod; return rdma_accept(id, conn_param); } EXPORT_SYMBOL(rdma_accept_ece); void rdma_lock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_lock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_lock_handler); void rdma_unlock_handler(struct rdma_cm_id *id) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); mutex_unlock(&id_priv->handler_mutex); } EXPORT_SYMBOL(rdma_unlock_handler); int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; switch (id->device->node_type) { case RDMA_NODE_IB_CA: ret = ib_cm_notify(id_priv->cm_id.ib, event); break; default: ret = 0; break; } return ret; } EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { if (id->qp_type == IB_QPT_UD) { ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, private_data, private_data_len); } else { ret = -ENOSYS; } return ret; } EXPORT_SYMBOL(rdma_reject); int rdma_disconnect(struct rdma_cm_id *id) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); if (!id_priv->cm_id.ib) return -EINVAL; if (rdma_cap_ib_cm(id->device, id->port_num)) { ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ trace_cm_disconnect(id_priv); if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0)) { if (!ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0)) trace_cm_sent_drep(id_priv); } else { trace_cm_sent_dreq(id_priv); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_disconnect(id_priv->cm_id.iw, 0); } else ret = -EINVAL; out: return ret; } EXPORT_SYMBOL(rdma_disconnect); static void cma_make_mc_event(int status, struct rdma_id_private *id_priv, struct ib_sa_multicast *multicast, struct rdma_cm_event *event, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr; enum ib_gid_type gid_type; struct net_device *ndev; if (status) pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n", status); event->status = status; event->param.ud.private_data = mc->context; if (status) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; return; } dev_addr = &id_priv->id.route.addr.dev_addr; ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); gid_type = id_priv->cma_dev ->default_gid_type[id_priv->id.port_num - rdma_start_port( id_priv->cma_dev->device)]; event->event = RDMA_CM_EVENT_MULTICAST_JOIN; if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, ndev, gid_type, &event->param.ud.ah_attr)) { event->event = RDMA_CM_EVENT_MULTICAST_ERROR; goto out; } event->param.ud.qp_num = 0xFFFFFF; event->param.ud.qkey = id_priv->qkey; out: dev_put(ndev); } static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct cma_multicast *mc = multicast->context; struct rdma_id_private *id_priv = mc->id_priv; struct rdma_cm_event event = {}; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL || READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING) goto out; ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); if (!ret) { cma_make_mc_event(status, id_priv, multicast, &event, mc); ret = cma_cm_event_handler(id_priv, &event); } rdma_destroy_ah_attr(&event.param.ud.ah_attr); WARN_ON(ret); out: mutex_unlock(&id_priv->handler_mutex); return 0; } static void cma_set_mgid(struct rdma_id_private *id_priv, struct sockaddr *addr, union ib_gid *mgid) { unsigned char mc_map[MAX_ADDR_LEN]; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sockaddr_in *sin = (struct sockaddr_in *) addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if ((addr->sa_family == AF_INET6) && ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) == 0xFF10A01B)) { /* IPv6 address is an SA assigned MGID. */ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } else { ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ *mgid = *(union ib_gid *) (mc_map + 4); } } static int cma_join_ib_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct ib_sa_mcmember_rec rec; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; ib_sa_comp_mask comp_mask; int ret; ib_addr_get_mgid(dev_addr, &rec.mgid); ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num, &rec.mgid, &rec); if (ret) return ret; if (!id_priv->qkey) { ret = cma_set_default_qkey(id_priv); if (ret) return ret; } cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid); rec.qkey = cpu_to_be32(id_priv->qkey); rdma_addr_get_sgid(dev_addr, &rec.port_gid); rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); rec.join_state = mc->join_state; comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE | IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL | IB_SA_MCMEMBER_REC_FLOW_LABEL | IB_SA_MCMEMBER_REC_TRAFFIC_CLASS; if (id_priv->id.ps == RDMA_PS_IPOIB) comp_mask |= IB_SA_MCMEMBER_REC_RATE | IB_SA_MCMEMBER_REC_RATE_SELECTOR | IB_SA_MCMEMBER_REC_MTU_SELECTOR | IB_SA_MCMEMBER_REC_MTU | IB_SA_MCMEMBER_REC_HOP_LIMIT; mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device, id_priv->id.port_num, &rec, comp_mask, GFP_KERNEL, cma_ib_mc_handler, mc); return PTR_ERR_OR_ZERO(mc->sa_mc); } static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; if (cma_any_addr(addr)) { memset(mgid, 0, sizeof *mgid); } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { mgid->raw[0] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; mgid->raw[1] = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; mgid->raw[5] = 0; mgid->raw[6] = 0; mgid->raw[7] = 0; mgid->raw[8] = 0; mgid->raw[9] = 0; mgid->raw[10] = 0xff; mgid->raw[11] = 0xff; *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr; } } static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, struct cma_multicast *mc) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; struct ib_sa_multicast ib; enum ib_gid_type gid_type; bool send_only; send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN); if (cma_zero_addr(addr)) return -EINVAL; gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type); ib.rec.pkey = cpu_to_be16(0xffff); if (dev_addr->bound_dev_if) ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) return -ENODEV; ib.rec.rate = IB_RATE_PORT_CURRENT; ib.rec.hop_limit = 1; ib.rec.mtu = iboe_get_mtu(ndev->mtu); if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; if (!send_only) { err = cma_igmp_send(ndev, &ib.rec.mgid, true); } } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) err = -ENOTSUPP; } dev_put(ndev); if (err || !ib.rec.mtu) return err ?: -EINVAL; if (!id_priv->qkey) cma_set_default_qkey(id_priv); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &ib.rec.port_gid); INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler); cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc); queue_work(cma_wq, &mc->iboe_join.work); return 0; } int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, u8 join_state, void *context) { struct rdma_id_private *id_priv = container_of(id, struct rdma_id_private, id); struct cma_multicast *mc; int ret; /* Not supported for kernel QPs */ if (WARN_ON(id->qp)) return -EINVAL; /* ULP is calling this wrong. */ if (!id->device || (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND && READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)) return -EINVAL; if (id_priv->id.qp_type != IB_QPT_UD) return -EINVAL; mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) return -ENOMEM; memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; mc->join_state = join_state; if (rdma_protocol_roce(id->device, id->port_num)) { ret = cma_iboe_join_multicast(id_priv, mc); if (ret) goto out_err; } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); if (ret) goto out_err; } else { ret = -ENOSYS; goto out_err; } spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); return 0; out_err: kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) { struct rdma_id_private *id_priv; struct cma_multicast *mc; id_priv = container_of(id, struct rdma_id_private, id); spin_lock_irq(&id_priv->lock); list_for_each_entry(mc, &id_priv->mc_list, list) { if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0) continue; list_del(&mc->list); spin_unlock_irq(&id_priv->lock); WARN_ON(id_priv->cma_dev->device != id->device); destroy_mc(id_priv, mc); return; } spin_unlock_irq(&id_priv->lock); } EXPORT_SYMBOL(rdma_leave_multicast); static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr; struct cma_work *work; dev_addr = &id_priv->id.route.addr.dev_addr; if ((dev_addr->bound_dev_if == ndev->ifindex) && (net_eq(dev_net(ndev), dev_addr->net)) && memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) { pr_info("RDMA CM addr change for ndev %s used by id %p\n", ndev->name, &id_priv->id); work = kzalloc(sizeof *work, GFP_KERNEL); if (!work) return -ENOMEM; INIT_WORK(&work->work, cma_work_handler); work->id = id_priv; work->event.event = RDMA_CM_EVENT_ADDR_CHANGE; cma_id_get(id_priv); queue_work(cma_wq, &work->work); } return 0; } static int cma_netdev_callback(struct notifier_block *self, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); struct cma_device *cma_dev; struct rdma_id_private *id_priv; int ret = NOTIFY_DONE; if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); list_for_each_entry(cma_dev, &dev_list, list) list_for_each_entry(id_priv, &cma_dev->id_list, device_item) { ret = cma_netdev_change(ndev, id_priv); if (ret) goto out; } out: mutex_unlock(&lock); return ret; } static void cma_netevent_work_handler(struct work_struct *_work) { struct rdma_id_private *id_priv = container_of(_work, struct rdma_id_private, id.net_work); struct rdma_cm_event event = {}; mutex_lock(&id_priv->handler_mutex); if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING || READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL) goto out_unlock; event.event = RDMA_CM_EVENT_UNREACHABLE; event.status = -ETIMEDOUT; if (cma_cm_event_handler(id_priv, &event)) { __acquire(&id_priv->handler_mutex); id_priv->cm_id.ib = NULL; cma_id_put(id_priv); destroy_id_handler_unlock(id_priv); return; } out_unlock: mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); } static int cma_netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { struct id_table_entry *ips_node = NULL; struct rdma_id_private *current_id; struct neighbour *neigh = ctx; unsigned long flags; if (event != NETEVENT_NEIGH_UPDATE) return NOTIFY_DONE; spin_lock_irqsave(&id_table_lock, flags); if (neigh->tbl->family == AF_INET6) { struct sockaddr_in6 neigh_sock_6; neigh_sock_6.sin6_family = AF_INET6; neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key; ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_6); } else if (neigh->tbl->family == AF_INET) { struct sockaddr_in neigh_sock_4; neigh_sock_4.sin_family = AF_INET; neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key); ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex, (struct sockaddr *)&neigh_sock_4); } else goto out; if (!ips_node) goto out; list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) { if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr, neigh->ha, ETH_ALEN)) continue; INIT_WORK(&current_id->id.net_work, cma_netevent_work_handler); cma_id_get(current_id); queue_work(cma_wq, &current_id->id.net_work); } out: spin_unlock_irqrestore(&id_table_lock, flags); return NOTIFY_DONE; } static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; static struct notifier_block cma_netevent_cb = { .notifier_call = cma_netevent_callback }; static void cma_send_device_removal_put(struct rdma_id_private *id_priv) { struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL }; enum rdma_cm_state state; unsigned long flags; mutex_lock(&id_priv->handler_mutex); /* Record that we want to remove the device */ spin_lock_irqsave(&id_priv->lock, flags); state = id_priv->state; if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) { spin_unlock_irqrestore(&id_priv->lock, flags); mutex_unlock(&id_priv->handler_mutex); cma_id_put(id_priv); return; } id_priv->state = RDMA_CM_DEVICE_REMOVAL; spin_unlock_irqrestore(&id_priv->lock, flags); if (cma_cm_event_handler(id_priv, &event)) { /* * At this point the ULP promises it won't call * rdma_destroy_id() concurrently */ cma_id_put(id_priv); mutex_unlock(&id_priv->handler_mutex); trace_cm_id_destroy(id_priv); _destroy_id(id_priv, state); return; } mutex_unlock(&id_priv->handler_mutex); /* * If this races with destroy then the thread that first assigns state * to a destroying does the cancel. */ cma_cancel_operation(id_priv, state); cma_id_put(id_priv); } static void cma_process_remove(struct cma_device *cma_dev) { mutex_lock(&lock); while (!list_empty(&cma_dev->id_list)) { struct rdma_id_private *id_priv = list_first_entry( &cma_dev->id_list, struct rdma_id_private, device_item); list_del_init(&id_priv->listen_item); list_del_init(&id_priv->device_item); cma_id_get(id_priv); mutex_unlock(&lock); cma_send_device_removal_put(id_priv); mutex_lock(&lock); } mutex_unlock(&lock); cma_dev_put(cma_dev); wait_for_completion(&cma_dev->comp); } static bool cma_supported(struct ib_device *device) { u32 i; rdma_for_each_port(device, i) { if (rdma_cap_ib_cm(device, i) || rdma_cap_iw_cm(device, i)) return true; } return false; } static int cma_add_one(struct ib_device *device) { struct rdma_id_private *to_destroy; struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned long supported_gids = 0; int ret; u32 i; if (!cma_supported(device)) return -EOPNOTSUPP; cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL); if (!cma_dev) return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); if (!cma_dev->default_gid_type) { ret = -ENOMEM; goto free_cma_dev; } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); if (!cma_dev->default_roce_tos) { ret = -ENOMEM; goto free_gid_type; } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); WARN_ON(!supported_gids); if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE)) cma_dev->default_gid_type[i - rdma_start_port(device)] = CMA_PREFERRED_ROCE_GID_TYPE; else cma_dev->default_gid_type[i - rdma_start_port(device)] = find_first_bit(&supported_gids, BITS_PER_LONG); cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0; } init_completion(&cma_dev->comp); refcount_set(&cma_dev->refcount, 1); INIT_LIST_HEAD(&cma_dev->id_list); ib_set_client_data(device, &cma_client, cma_dev); mutex_lock(&lock); list_add_tail(&cma_dev->list, &dev_list); list_for_each_entry(id_priv, &listen_any_list, listen_any_item) { ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy); if (ret) goto free_listen; } mutex_unlock(&lock); trace_cm_add_one(device); return 0; free_listen: list_del(&cma_dev->list); mutex_unlock(&lock); /* cma_process_remove() will delete to_destroy */ cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); return ret; } static void cma_remove_one(struct ib_device *device, void *client_data) { struct cma_device *cma_dev = client_data; trace_cm_remove_one(device); mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); cma_process_remove(cma_dev); kfree(cma_dev->default_roce_tos); kfree(cma_dev->default_gid_type); kfree(cma_dev); } static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); xa_init(&pernet->tcp_ps); xa_init(&pernet->udp_ps); xa_init(&pernet->ipoib_ps); xa_init(&pernet->ib_ps); return 0; } static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); WARN_ON(!xa_empty(&pernet->tcp_ps)); WARN_ON(!xa_empty(&pernet->udp_ps)); WARN_ON(!xa_empty(&pernet->ipoib_ps)); WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { .init = cma_init_net, .exit = cma_exit_net, .id = &cma_pernet_id, .size = sizeof(struct cma_pernet), }; static int __init cma_init(void) { int ret; /* * There is a rare lock ordering dependency in cma_netdev_callback() * that only happens when bonding is enabled. Teach lockdep that rtnl * must never be nested under lock so it can find these without having * to test with bonding. */ if (IS_ENABLED(CONFIG_LOCKDEP)) { rtnl_lock(); mutex_lock(&lock); mutex_unlock(&lock); rtnl_unlock(); } cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM); if (!cma_wq) return -ENOMEM; ret = register_pernet_subsys(&cma_pernet_operations); if (ret) goto err_wq; ib_sa_register_client(&sa_client); register_netdevice_notifier(&cma_nb); register_netevent_notifier(&cma_netevent_cb); ret = ib_register_client(&cma_client); if (ret) goto err; ret = cma_configfs_init(); if (ret) goto err_ib; return 0; err_ib: ib_unregister_client(&cma_client); err: unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); err_wq: destroy_workqueue(cma_wq); return ret; } static void __exit cma_cleanup(void) { cma_configfs_exit(); ib_unregister_client(&cma_client); unregister_netevent_notifier(&cma_netevent_cb); unregister_netdevice_notifier(&cma_nb); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); } module_init(cma_init); module_exit(cma_cleanup);
260 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 // SPDX-License-Identifier: GPL-2.0-only /* Expectation handling for nf_conntrack. */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (c) 2005-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); struct hlist_head *nf_ct_expect_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static siphash_aligned_key_t nf_ct_expect_hashrnd; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { struct nf_conntrack_expect *exp = from_timer(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { struct { union nf_inet_addr dst_addr; u32 net_mix; u16 dport; u8 l3num; u8 protonum; } __aligned(SIPHASH_ALIGNMENT) combined; u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); memset(&combined, 0, sizeof(combined)); combined.dst_addr = tuple->dst.u3; combined.net_mix = net_hash_mix(n); combined.dport = (__force __u16)tuple->dst.u.all; combined.l3num = tuple->src.l3num; combined.protonum = tuple->dst.protonum; hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } static bool nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_expect *i, const struct nf_conntrack_zone *zone, const struct net *net) { return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && net_eq(net, nf_ct_net(i->master)) && nf_ct_zone_equal_any(i->master, zone); } bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; } return false; } EXPORT_SYMBOL_GPL(nf_ct_remove_expect); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; } EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); i = __nf_ct_expect_find(net, zone, tuple); if (i && !refcount_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); return i; } EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } } if (!exp) return NULL; /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (!nf_ct_is_confirmed(exp->master)) return NULL; /* Avoid race with other CPUs, that for exp->master ct, is * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } /* Undo exp->master refcnt increase, if del_timer() failed */ nf_ct_put(exp->master); return NULL; } /* delete all expectations for this conntrack */ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; struct hlist_node *next; /* Optimization: most connection never expect any others. */ if (!help) return; spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { nf_ct_remove_expect(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); /* Would two expected things clash? */ static inline int expect_clash(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { /* Part covered by intersection of masks must be unequal, otherwise they clash */ struct nf_conntrack_tuple_mask intersect_mask; int count; intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ intersect_mask.src.u3.all[count] = a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static bool master_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b, unsigned int flags) { if (flags & NF_CT_EXP_F_SKIP_MASTER) return true; return a->master == b->master; } /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_remove_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); /* We don't increase the master conntrack refcount for non-fulfilled * conntracks. During the conntrack destruction, the expectations are * always killed before the conntrack itself */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) { struct nf_conntrack_expect *new; new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); if (!new) return NULL; new->master = me; refcount_set(&new->use, 1); return new; } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) { int len; if (family == AF_INET) len = 4; else len = 16; exp->flags = 0; exp->class = class; exp->expectfn = NULL; exp->helper = NULL; exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; if (saddr) { memcpy(&exp->tuple.src.u3, saddr, len); if (sizeof(exp->tuple.src.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.src.u3 + len, 0x00, sizeof(exp->tuple.src.u3) - len); memset(&exp->mask.src.u3, 0xFF, len); if (sizeof(exp->mask.src.u3) > len) memset((void *)&exp->mask.src.u3 + len, 0x00, sizeof(exp->mask.src.u3) - len); } else { memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); } if (src) { exp->tuple.src.u.all = *src; exp->mask.src.u.all = htons(0xFFFF); } else { exp->tuple.src.u.all = 0; exp->mask.src.u.all = 0; } memcpy(&exp->tuple.dst.u3, daddr, len); if (sizeof(exp->tuple.dst.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.dst.u3 + len, 0x00, sizeof(exp->tuple.dst.u3) - len); exp->tuple.dst.u.all = *dst; #if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif } EXPORT_SYMBOL_GPL(nf_ct_expect_init); static void nf_ct_expect_free_rcu(struct rcu_head *head) { struct nf_conntrack_expect *exp; exp = container_of(head, struct nf_conntrack_expect, rcu); kmem_cache_free(nf_ct_expect_cachep, exp); } void nf_ct_expect_put(struct nf_conntrack_expect *exp) { if (refcount_dec_and_test(&exp->use)) call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0); helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; } add_timer(&exp->timeout); hlist_add_head_rcu(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, struct nf_conntrack_expect *new) { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } if (last) nf_ct_remove_expect(last); } static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, unsigned int flags) { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; int ret = 0; if (!master_help) { ret = -ESHUTDOWN; goto out; } h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (master_matches(i, expect, flags) && expect_matches(i, expect)) { if (i->class != expect->class || i->master != expect->master) return -EALREADY; if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; } } /* Will be over limit? */ helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); if (master_help->expecting[expect->class] >= p->max_expected) { ret = -EMFILE; goto out; } } } cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } out: return ret; } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags) { int ret; spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect, flags); if (ret < 0) goto out; nf_ct_expect_insert(expect); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) continue; if (iter(exp, data) && del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); #ifdef CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } return NULL; } static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head = ct_expect_get_first(seq); if (head) while (pos && (head = ct_expect_get_next(seq, head))) pos--; return pos ? NULL : head; } static void *exp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ct_expect_get_idx(seq, *pos); } static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return ct_expect_get_next(seq, v); } static void exp_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int exp_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_expect *expect; struct nf_conntrack_helper *helper; struct hlist_node *n = v; char *delim = ""; expect = hlist_entry(n, struct nf_conntrack_expect, hnode); if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ? (long)(expect->timeout.expires - jiffies)/HZ : 0); else seq_puts(s, "- "); seq_printf(s, "l3proto = %u proto=%u ", expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); delim = ","; } if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); delim = ","; } if (expect->flags & NF_CT_EXPECT_USERSPACE) seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } seq_putc(s, '\n'); return 0; } static const struct seq_operations exp_seq_ops = { .start = exp_seq_start, .next = exp_seq_next, .stop = exp_seq_stop, .show = exp_seq_show }; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ static int exp_proc_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; proc = proc_create_net("nf_conntrack_expect", 0440, net->proc_net, &exp_seq_ops, sizeof(struct ct_expect_iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ return 0; } static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); } int nf_conntrack_expect_init(void) { if (!nf_ct_expect_hsize) { nf_ct_expect_hsize = nf_conntrack_htable_size / 256; if (!nf_ct_expect_hsize) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (!nf_ct_expect_hash) { kmem_cache_destroy(nf_ct_expect_cachep); return -ENOMEM; } return 0; } void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); kvfree(nf_ct_expect_hash); }
21 21 21 21 21 21 16 18 18 18 18 18 16 18 18 18 21 16 18 18 16 16 16 16 16 16 17 18 3 3 15 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/rculist.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_stp.h" /* since time values in bpdu are in jiffies and then scaled (1/256) * before sending, make sure that is at least one STP tick. */ #define MESSAGE_AGE_INCR ((HZ / 256) + 1) static const char *const br_port_state_names[] = { [BR_STATE_DISABLED] = "disabled", [BR_STATE_LISTENING] = "listening", [BR_STATE_LEARNING] = "learning", [BR_STATE_FORWARDING] = "forwarding", [BR_STATE_BLOCKING] = "blocking", }; void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { .orig_dev = p->dev, .id = SWITCHDEV_ATTR_ID_PORT_STP_STATE, .flags = SWITCHDEV_F_DEFER, .u.stp_state = state, }; int err; /* Don't change the state of the ports if they are driven by a different * protocol. */ if (p->flags & BR_MRP_AWARE) return; p->state = state; if (br_opt_get(p->br, BROPT_MST_ENABLED)) { err = br_mst_set_state(p, 0, state, NULL); if (err) br_warn(p->br, "error setting MST state on port %u(%s)\n", p->port_no, netdev_name(p->dev)); } err = switchdev_port_attr_set(p->dev, &attr, NULL); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); else br_info(p->br, "port %u(%s) entered %s state\n", (unsigned int) p->port_no, p->dev->name, br_port_state_names[p->state]); if (p->br->stp_enabled == BR_KERNEL_STP) { switch (p->state) { case BR_STATE_BLOCKING: p->stp_xstats.transition_blk++; break; case BR_STATE_FORWARDING: p->stp_xstats.transition_fwd++; break; } } } u8 br_port_get_stp_state(const struct net_device *dev) { struct net_bridge_port *p; ASSERT_RTNL(); p = br_port_get_rtnl(dev); if (!p) return BR_STATE_DISABLED; return p->state; } EXPORT_SYMBOL_GPL(br_port_get_stp_state); /* called under bridge lock */ struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) { struct net_bridge_port *p; list_for_each_entry_rcu(p, &br->port_list, list, lockdep_is_held(&br->lock)) { if (p->port_no == port_no) return p; } return NULL; } /* called under bridge lock */ static int br_should_become_root_port(const struct net_bridge_port *p, u16 root_port) { struct net_bridge *br; struct net_bridge_port *rp; int t; br = p->br; if (p->state == BR_STATE_DISABLED || br_is_designated_port(p)) return 0; if (memcmp(&br->bridge_id, &p->designated_root, 8) <= 0) return 0; if (!root_port) return 1; rp = br_get_port(br, root_port); t = memcmp(&p->designated_root, &rp->designated_root, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->designated_cost + p->path_cost < rp->designated_cost + rp->path_cost) return 1; else if (p->designated_cost + p->path_cost > rp->designated_cost + rp->path_cost) return 0; t = memcmp(&p->designated_bridge, &rp->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->designated_port < rp->designated_port) return 1; else if (p->designated_port > rp->designated_port) return 0; if (p->port_id < rp->port_id) return 1; return 0; } static void br_root_port_block(const struct net_bridge *br, struct net_bridge_port *p) { br_notice(br, "port %u(%s) tried to become root port (blocked)", (unsigned int) p->port_no, p->dev->name); br_set_state(p, BR_STATE_LISTENING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay > 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } /* called under bridge lock */ static void br_root_selection(struct net_bridge *br) { struct net_bridge_port *p; u16 root_port = 0; list_for_each_entry(p, &br->port_list, list) { if (!br_should_become_root_port(p, root_port)) continue; if (p->flags & BR_ROOT_BLOCK) br_root_port_block(br, p); else root_port = p->port_no; } br->root_port = root_port; if (!root_port) { br->designated_root = br->bridge_id; br->root_path_cost = 0; } else { p = br_get_port(br, root_port); br->designated_root = p->designated_root; br->root_path_cost = p->designated_cost + p->path_cost; } } /* called under bridge lock */ void br_become_root_bridge(struct net_bridge *br) { br->max_age = br->bridge_max_age; br->hello_time = br->bridge_hello_time; br->forward_delay = br->bridge_forward_delay; br_topology_change_detection(br); del_timer(&br->tcn_timer); if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); mod_timer(&br->hello_timer, jiffies + br->hello_time); } } /* called under bridge lock */ void br_transmit_config(struct net_bridge_port *p) { struct br_config_bpdu bpdu; struct net_bridge *br; if (timer_pending(&p->hold_timer)) { p->config_pending = 1; return; } br = p->br; bpdu.topology_change = br->topology_change; bpdu.topology_change_ack = p->topology_change_ack; bpdu.root = br->designated_root; bpdu.root_path_cost = br->root_path_cost; bpdu.bridge_id = br->bridge_id; bpdu.port_id = p->port_id; if (br_is_root_bridge(br)) bpdu.message_age = 0; else { struct net_bridge_port *root = br_get_port(br, br->root_port); bpdu.message_age = (jiffies - root->designated_age) + MESSAGE_AGE_INCR; } bpdu.max_age = br->max_age; bpdu.hello_time = br->hello_time; bpdu.forward_delay = br->forward_delay; if (bpdu.message_age < br->max_age) { br_send_config_bpdu(p, &bpdu); p->topology_change_ack = 0; p->config_pending = 0; if (p->br->stp_enabled == BR_KERNEL_STP) mod_timer(&p->hold_timer, round_jiffies(jiffies + BR_HOLD_TIME)); } } /* called under bridge lock */ static void br_record_config_information(struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { p->designated_root = bpdu->root; p->designated_cost = bpdu->root_path_cost; p->designated_bridge = bpdu->bridge_id; p->designated_port = bpdu->port_id; p->designated_age = jiffies - bpdu->message_age; mod_timer(&p->message_age_timer, jiffies + (bpdu->max_age - bpdu->message_age)); } /* called under bridge lock */ static void br_record_config_timeout_values(struct net_bridge *br, const struct br_config_bpdu *bpdu) { br->max_age = bpdu->max_age; br->hello_time = bpdu->hello_time; br->forward_delay = bpdu->forward_delay; __br_set_topology_change(br, bpdu->topology_change); } /* called under bridge lock */ void br_transmit_tcn(struct net_bridge *br) { struct net_bridge_port *p; p = br_get_port(br, br->root_port); if (p) br_send_tcn_bpdu(p); else br_notice(br, "root port %u not found for topology notice\n", br->root_port); } /* called under bridge lock */ static int br_should_become_designated_port(const struct net_bridge_port *p) { struct net_bridge *br; int t; br = p->br; if (br_is_designated_port(p)) return 1; if (memcmp(&p->designated_root, &br->designated_root, 8)) return 1; if (br->root_path_cost < p->designated_cost) return 1; else if (br->root_path_cost > p->designated_cost) return 0; t = memcmp(&br->bridge_id, &p->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (p->port_id < p->designated_port) return 1; return 0; } /* called under bridge lock */ static void br_designated_port_selection(struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_should_become_designated_port(p)) br_become_designated_port(p); } } /* called under bridge lock */ static int br_supersedes_port_info(const struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { int t; t = memcmp(&bpdu->root, &p->designated_root, 8); if (t < 0) return 1; else if (t > 0) return 0; if (bpdu->root_path_cost < p->designated_cost) return 1; else if (bpdu->root_path_cost > p->designated_cost) return 0; t = memcmp(&bpdu->bridge_id, &p->designated_bridge, 8); if (t < 0) return 1; else if (t > 0) return 0; if (memcmp(&bpdu->bridge_id, &p->br->bridge_id, 8)) return 1; if (bpdu->port_id <= p->designated_port) return 1; return 0; } /* called under bridge lock */ static void br_topology_change_acknowledged(struct net_bridge *br) { br->topology_change_detected = 0; del_timer(&br->tcn_timer); } /* called under bridge lock */ void br_topology_change_detection(struct net_bridge *br) { int isroot = br_is_root_bridge(br); if (br->stp_enabled != BR_KERNEL_STP) return; br_info(br, "topology change detected, %s\n", isroot ? "propagating" : "sending tcn bpdu"); if (isroot) { __br_set_topology_change(br, 1); mod_timer(&br->topology_change_timer, jiffies + br->bridge_forward_delay + br->bridge_max_age); } else if (!br->topology_change_detected) { br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } br->topology_change_detected = 1; } /* called under bridge lock */ void br_config_bpdu_generation(struct net_bridge *br) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_is_designated_port(p)) br_transmit_config(p); } } /* called under bridge lock */ static void br_reply(struct net_bridge_port *p) { br_transmit_config(p); } /* called under bridge lock */ void br_configuration_update(struct net_bridge *br) { br_root_selection(br); br_designated_port_selection(br); } /* called under bridge lock */ void br_become_designated_port(struct net_bridge_port *p) { struct net_bridge *br; br = p->br; p->designated_root = br->designated_root; p->designated_cost = br->root_path_cost; p->designated_bridge = br->bridge_id; p->designated_port = p->port_id; } /* called under bridge lock */ static void br_make_blocking(struct net_bridge_port *p) { if (p->state != BR_STATE_DISABLED && p->state != BR_STATE_BLOCKING) { if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) br_topology_change_detection(p->br); br_set_state(p, BR_STATE_BLOCKING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); del_timer(&p->forward_delay_timer); } } /* called under bridge lock */ static void br_make_forwarding(struct net_bridge_port *p) { struct net_bridge *br = p->br; if (p->state != BR_STATE_BLOCKING) return; if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) { br_set_state(p, BR_STATE_FORWARDING); br_topology_change_detection(br); del_timer(&p->forward_delay_timer); } else if (br->stp_enabled == BR_KERNEL_STP) br_set_state(p, BR_STATE_LISTENING); else br_set_state(p, BR_STATE_LEARNING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (br->forward_delay != 0) mod_timer(&p->forward_delay_timer, jiffies + br->forward_delay); } /* called under bridge lock */ void br_port_state_selection(struct net_bridge *br) { struct net_bridge_port *p; unsigned int liveports = 0; list_for_each_entry(p, &br->port_list, list) { if (p->state == BR_STATE_DISABLED) continue; /* Don't change port states if userspace is handling STP */ if (br->stp_enabled != BR_USER_STP) { if (p->port_no == br->root_port) { p->config_pending = 0; p->topology_change_ack = 0; br_make_forwarding(p); } else if (br_is_designated_port(p)) { del_timer(&p->message_age_timer); br_make_forwarding(p); } else { p->config_pending = 0; p->topology_change_ack = 0; br_make_blocking(p); } } if (p->state != BR_STATE_BLOCKING) br_multicast_enable_port(p); /* Multicast is not disabled for the port when it goes in * blocking state because the timers will expire and stop by * themselves without sending more queries. */ if (p->state == BR_STATE_FORWARDING) ++liveports; } if (liveports == 0) netif_carrier_off(br->dev); else netif_carrier_on(br->dev); } /* called under bridge lock */ static void br_topology_change_acknowledge(struct net_bridge_port *p) { p->topology_change_ack = 1; br_transmit_config(p); } /* called under bridge lock */ void br_received_config_bpdu(struct net_bridge_port *p, const struct br_config_bpdu *bpdu) { struct net_bridge *br; int was_root; p->stp_xstats.rx_bpdu++; br = p->br; was_root = br_is_root_bridge(br); if (br_supersedes_port_info(p, bpdu)) { br_record_config_information(p, bpdu); br_configuration_update(br); br_port_state_selection(br); if (!br_is_root_bridge(br) && was_root) { del_timer(&br->hello_timer); if (br->topology_change_detected) { del_timer(&br->topology_change_timer); br_transmit_tcn(br); mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); } } if (p->port_no == br->root_port) { br_record_config_timeout_values(br, bpdu); br_config_bpdu_generation(br); if (bpdu->topology_change_ack) br_topology_change_acknowledged(br); } } else if (br_is_designated_port(p)) { br_reply(p); } } /* called under bridge lock */ void br_received_tcn_bpdu(struct net_bridge_port *p) { p->stp_xstats.rx_tcn++; if (br_is_designated_port(p)) { br_info(p->br, "port %u(%s) received tcn bpdu\n", (unsigned int) p->port_no, p->dev->name); br_topology_change_detection(p->br); br_topology_change_acknowledge(p); } } /* Change bridge STP parameter */ int br_set_hello_time(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); if (t < BR_MIN_HELLO_TIME || t > BR_MAX_HELLO_TIME) return -ERANGE; spin_lock_bh(&br->lock); br->bridge_hello_time = t; if (br_is_root_bridge(br)) br->hello_time = br->bridge_hello_time; spin_unlock_bh(&br->lock); return 0; } int br_set_max_age(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); if (t < BR_MIN_MAX_AGE || t > BR_MAX_MAX_AGE) return -ERANGE; spin_lock_bh(&br->lock); br->bridge_max_age = t; if (br_is_root_bridge(br)) br->max_age = br->bridge_max_age; spin_unlock_bh(&br->lock); return 0; } /* called under bridge lock */ int __set_ageing_time(struct net_device *dev, unsigned long t) { struct switchdev_attr attr = { .orig_dev = dev, .id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER, .u.ageing_time = jiffies_to_clock_t(t), }; int err; err = switchdev_port_attr_set(dev, &attr, NULL); if (err && err != -EOPNOTSUPP) return err; return 0; } /* Set time interval that dynamic forwarding entries live * For pure software bridge, allow values outside the 802.1 * standard specification for special cases: * 0 - entry never ages (all permanent) * 1 - entry disappears (no persistence) * * Offloaded switch entries maybe more restrictive */ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time) { unsigned long t = clock_t_to_jiffies(ageing_time); int err; err = __set_ageing_time(br->dev, t); if (err) return err; spin_lock_bh(&br->lock); br->bridge_ageing_time = t; br->ageing_time = t; spin_unlock_bh(&br->lock); mod_delayed_work(system_long_wq, &br->gc_work, 0); return 0; } clock_t br_get_ageing_time(const struct net_device *br_dev) { const struct net_bridge *br; if (!netif_is_bridge_master(br_dev)) return 0; br = netdev_priv(br_dev); return jiffies_to_clock_t(br->ageing_time); } EXPORT_SYMBOL_GPL(br_get_ageing_time); /* called under bridge lock */ void __br_set_topology_change(struct net_bridge *br, unsigned char val) { unsigned long t; int err; if (br->stp_enabled == BR_KERNEL_STP && br->topology_change != val) { /* On topology change, set the bridge ageing time to twice the * forward delay. Otherwise, restore its default ageing time. */ if (val) { t = 2 * br->forward_delay; br_debug(br, "decreasing ageing time to %lu\n", t); } else { t = br->bridge_ageing_time; br_debug(br, "restoring ageing time to %lu\n", t); } err = __set_ageing_time(br->dev, t); if (err) br_warn(br, "error offloading ageing time\n"); else br->ageing_time = t; } br->topology_change = val; } void __br_set_forward_delay(struct net_bridge *br, unsigned long t) { br->bridge_forward_delay = t; if (br_is_root_bridge(br)) br->forward_delay = br->bridge_forward_delay; } int br_set_forward_delay(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); int err = -ERANGE; spin_lock_bh(&br->lock); if (br->stp_enabled != BR_NO_STP && (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)) goto unlock; __br_set_forward_delay(br, t); err = 0; unlock: spin_unlock_bh(&br->lock); return err; }
27 26 28 46 45 40 39 28 27 27 11 46 12 46 7 4 6 6 6 9 4 2 4 2 1 25 13 4 4 2 1 1 11 9 9 12 15 2 12 12 3 3 3 3 4 3 2 4 7 4 4 20 20 17 3 2 1 15 1 15 14 20 1 2 5 6 5 5 2 3 2 1 1 7 18 18 17 17 12 11 2 19 1125 753 753 136 1765 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io */ /* Devmaps primary use is as a backend map for XDP BPF helper call * bpf_redirect_map(). Because XDP is mostly concerned with performance we * spent some effort to ensure the datapath with redirect maps does not use * any locking. This is a quick note on the details. * * We have three possible paths to get into the devmap control plane bpf * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall * will invoke an update, delete, or lookup operation. To ensure updates and * deletes appear atomic from the datapath side xchg() is used to modify the * netdev_map array. Then because the datapath does a lookup into the netdev_map * array (read-only) from an RCU critical section we use call_rcu() to wait for * an rcu grace period before free'ing the old data structures. This ensures the * datapath always has a valid copy. However, the datapath does a "flush" * operation that pushes any pending packets in the driver outside the RCU * critical section. Each bpf_dtab_netdev tracks these pending operations using * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until * this list is empty, indicating outstanding flush operations have completed. * * BPF syscalls may race with BPF program calls on any of the update, delete * or lookup operations. As noted above the xchg() operation also keep the * netdev_map consistent in this case. From the devmap side BPF programs * calling into these operations are the same as multiple user space threads * making system calls. * * Finally, any of the above may race with a netdev_unregister notifier. The * unregister notifier must search for net devices in the map structure that * contain a reference to the net device and remove them. This is a two step * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) * check to see if the ifindex is the same as the net_device being removed. * When removing the dev a cmpxchg() is used to ensure the correct dev is * removed, in the case of a concurrent update or delete operation it is * possible that the initially referenced dev is no longer in the map. As the * notifier hook walks the map we know that new dev references can not be * added by the user because core infrastructure ensures dev_get_by_index() * calls will fail at this point. * * The devmap_hash type is a map type which interprets keys as ifindexes and * indexes these using a hashmap. This allows maps that use ifindex as key to be * densely packed instead of having holes in the lookup array for unused * ifindexes. The setup and packet enqueue/send code is shared between the two * types of devmap; only the lookup and insertion is different. */ #include <linux/bpf.h> #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) struct xdp_dev_bulk_queue { struct xdp_frame *q[DEV_MAP_BULK_SIZE]; struct list_head flush_node; struct net_device *dev; struct net_device *dev_rx; struct bpf_prog *xdp_prog; unsigned int count; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; }; struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ struct list_head list; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; spinlock_t index_lock; unsigned int items; u32 n_buckets; }; static DEFINE_PER_CPU(struct list_head, dev_flush_list); static DEFINE_SPINLOCK(dev_map_lock); static LIST_HEAD(dev_map_list); static struct hlist_head *dev_map_create_hash(unsigned int entries, int numa_node) { int i; struct hlist_head *hash; hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); if (hash != NULL) for (i = 0; i < entries; i++) INIT_HLIST_HEAD(&hash[i]); return hash; } static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, int idx) { return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; } static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { u32 valsize = attr->value_size; /* check sanity of attributes. 2 value sizes supported: * 4 bytes: ifindex * 8 bytes: ifindex + prog fd */ if (attr->max_entries == 0 || attr->key_size != 4 || (valsize != offsetofend(struct bpf_devmap_val, ifindex) && valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ attr->map_flags |= BPF_F_RDONLY_PROG; bpf_map_init_from_attr(&dtab->map, attr); if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; } if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) return -ENOMEM; } return 0; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) { struct bpf_dtab *dtab; int err; dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE); if (!dtab) return ERR_PTR(-ENOMEM); err = dev_map_init_map(dtab, attr); if (err) { bpf_map_area_free(dtab); return ERR_PTR(err); } spin_lock(&dev_map_lock); list_add_tail_rcu(&dtab->list, &dev_map_list); spin_unlock(&dev_map_lock); return &dtab->map; } static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); int i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were * disconnected from events. The following synchronize_rcu() guarantees * both rcu read critical sections complete and waits for * preempt-disable regions (NAPI being the relevant context here) so we * are certain there will be no further reads against the netdev_map and * all flush operations are complete. Flush operations can only be done * from NAPI context for this reason. */ spin_lock(&dev_map_lock); list_del_rcu(&dtab->list); spin_unlock(&dev_map_lock); bpf_clear_redirect_map(map); synchronize_rcu(); /* Make sure prior __dev_map_entry_free() have completed. */ rcu_barrier(); if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } } bpf_map_area_free(dtab->dev_index_head); } else { for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev; dev = rcu_dereference_raw(dtab->netdev_map[i]); if (!dev) continue; if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } bpf_map_area_free(dtab->netdev_map); } bpf_map_area_free(dtab); } static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 index = key ? *(u32 *)key : U32_MAX; u32 *next = next_key; if (index >= dtab->map.max_entries) { *next = 0; return 0; } if (index == dtab->map.max_entries - 1) return -ENOENT; *next = index + 1; return 0; } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct hlist_head *head = dev_map_index_hash(dtab, key); struct bpf_dtab_netdev *dev; hlist_for_each_entry_rcu(dev, head, index_hlist, lockdep_is_held(&dtab->index_lock)) if (dev->idx == key) return dev; return NULL; } static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u32 idx, *next = next_key; struct bpf_dtab_netdev *dev, *next_dev; struct hlist_head *head; int i = 0; if (!key) goto find_first; idx = *(u32 *)key; dev = __dev_map_hash_lookup_elem(map, idx); if (!dev) goto find_first; next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } i = idx & (dtab->n_buckets - 1); i++; find_first: for (; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), struct bpf_dtab_netdev, index_hlist); if (next_dev) { *next = next_dev->idx; return 0; } } return -ENOENT; } static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, struct net_device *dev) { struct xdp_txq_info txq = { .dev = dev }; struct xdp_buff xdp; int i, nframes = 0; for (i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; u32 act; int err; xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { case XDP_PASS: err = xdp_update_frame_from_buff(&xdp, xdpf); if (unlikely(err < 0)) xdp_return_frame_rx_napi(xdpf); else frames[nframes++] = xdpf; break; default: bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); break; } } return nframes; /* sent frames count */ } static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; unsigned int cnt = bq->count; int sent = 0, err = 0; int to_send = cnt; int i; if (unlikely(!cnt)) return; for (i = 0; i < cnt; i++) { struct xdp_frame *xdpf = bq->q[i]; prefetch(xdpf); } if (bq->xdp_prog) { to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); if (!to_send) goto out; } sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags); if (sent < 0) { /* If ndo_xdp_xmit fails with an errno, no frames have * been xmit'ed. */ err = sent; sent = 0; } /* If not all frames have been transmitted, it is our * responsibility to free them */ for (i = sent; unlikely(i < to_send); i++) xdp_return_frame_rx_napi(bq->q[i]); out: bq->count = 0; trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err); } /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the * driver before returning from its napi->poll() routine. See the comment above * xdp_do_flush() in filter.c. */ void __dev_flush(void) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq, *tmp; list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { bq_xmit_all(bq, XDP_XMIT_FLUSH); bq->dev_rx = NULL; bq->xdp_prog = NULL; __list_del_clearprev(&bq->flush_node); } } /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or * by local_bh_disable() (from XDP calls inside NAPI). The * rcu_read_lock_bh_held() below makes lockdep accept both. */ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *obj; if (key >= map->max_entries) return NULL; obj = rcu_dereference_check(dtab->netdev_map[key], rcu_read_lock_bh_held()); return obj; } /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu * variable access, and map elements stick around. See comment above * xdp_do_flush() in filter.c. */ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { struct list_head *flush_list = this_cpu_ptr(&dev_flush_list); struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq); if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) bq_xmit_all(bq, 0); /* Ingress dev_rx will be the same for all xdp_frame's in * bulk_queue, because bq stored per-CPU and must be flushed * from net_device drivers NAPI func end. * * Do the same with xdp_prog and flush_list since these fields * are only ever modified together. */ if (!bq->dev_rx) { bq->dev_rx = dev_rx; bq->xdp_prog = xdp_prog; list_add(&bq->flush_node, flush_list); } bq->q[bq->count++] = xdpf; } static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { int err; if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return -EOPNOTSUPP; if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return -EOPNOTSUPP; err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf)); if (unlikely(err)) return err; bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst) { struct xdp_txq_info txq = { .dev = dst->dev }; struct xdp_buff xdp; u32 act; if (!dst->xdp_prog) return XDP_PASS; __skb_pull(skb, skb->mac_len); xdp.txq = &txq; act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog); switch (act) { case XDP_PASS: __skb_push(skb, skb->mac_len); break; default: bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); fallthrough; case XDP_DROP: kfree_skb(skb); break; } return act; } int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj) return false; if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT)) return false; if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) && xdp_frame_has_frags(xdpf))) return false; if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf))) return false; return true; } static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj, struct net_device *dev_rx, struct xdp_frame *xdpf) { struct xdp_frame *nxdpf; nxdpf = xdpf_clone(xdpf); if (!nxdpf) return -ENOMEM; bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog); return 0; } static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex) { while (num_excluded--) { if (ifindex == excluded[num_excluded]) return true; } return false; } /* Get ifindex of each upper device. 'indexes' must be able to hold at * least MAX_NEST_DEV elements. * Returns the number of ifindexes added. */ static int get_upper_ifindexes(struct net_device *dev, int *indexes) { struct net_device *upper; struct list_head *iter; int n = 0; netdev_for_each_upper_dev_rcu(dev, upper, iter) { indexes[n++] = upper->ifindex; } return n; } int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev_rx, excluded_devices); excluded_devices[num_excluded++] = dev_rx->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf); if (err) return err; last_dst = dst; } } } /* consume the last copy of the frame */ if (last_dst) bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog); else xdp_return_frame_rx_napi(xdpf); /* dtab is empty */ return 0; } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { int err; err = xdp_ok_fwd_dev(dst->dev, skb->len); if (unlikely(err)) return err; /* Redirect has already succeeded semantically at this point, so we just * return 0 even if packet is dropped. Helper below takes care of * freeing skb. */ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS) return 0; skb->dev = dst->dev; generic_xdp_tx(skb, xdp_prog); return 0; } static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; err = dev_map_generic_redirect(dst, nskb, xdp_prog); if (unlikely(err)) { consume_skb(nskb); return err; } return 0; } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; struct hlist_node *next; int num_excluded = 0; unsigned int i; int err; if (exclude_ingress) { num_excluded = get_upper_ifindexes(dev, excluded_devices); excluded_devices[num_excluded++] = dev->ifindex; } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); if (!dst) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } else { /* BPF_MAP_TYPE_DEVMAP_HASH */ for (i = 0; i < dtab->n_buckets; i++) { head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dst, next, head, index_hlist) { if (!dst) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) continue; /* we only need n-1 clones; last_dst enqueued below */ if (!last_dst) { last_dst = dst; continue; } err = dev_map_redirect_clone(last_dst, skb, xdp_prog); if (err) return err; last_dst = dst; } } } /* consume the first skb and return */ if (last_dst) return dev_map_generic_redirect(last_dst, skb, xdp_prog); /* dtab is empty */ consume_skb(skb); return 0; } static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, *(u32 *)key); return obj ? &obj->val : NULL; } static void __dev_map_entry_free(struct rcu_head *rcu) { struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); if (dev->xdp_prog) bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; int k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL)); if (old_dev) { call_rcu(&old_dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } return 0; } static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; int k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, k); if (old_dev) { dtab->items--; hlist_del_init_rcu(&old_dev->index_hlist); call_rcu(&old_dev->rcu, __dev_map_entry_free); ret = 0; } spin_unlock_irqrestore(&dtab->index_lock, flags); return ret; } static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_dtab *dtab, struct bpf_devmap_val *val, unsigned int idx) { struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), GFP_NOWAIT | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); dev->dev = dev_get_by_index(net, val->ifindex); if (!dev->dev) goto err_out; if (val->bpf_prog.fd > 0) { prog = bpf_prog_get_type_dev(val->bpf_prog.fd, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; if (prog->expected_attach_type != BPF_XDP_DEVMAP || !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } dev->idx = idx; if (prog) { dev->xdp_prog = prog; dev->val.bpf_prog.id = prog->aux->id; } else { dev->xdp_prog = NULL; dev->val.bpf_prog.id = 0; } dev->val.ifindex = val->ifindex; return dev; err_put_prog: bpf_prog_put(prog); err_put_dev: dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); } static long __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) return -EINVAL; if (unlikely(i >= dtab->map.max_entries)) return -E2BIG; if (unlikely(map_flags == BPF_NOEXIST)) return -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (!val.ifindex) { dev = NULL; /* can not specify fd if ifindex is 0 */ if (val.bpf_prog.fd > 0) return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } /* Use call_rcu() here to ensure rcu critical sections have completed * Remembering the driver side flush operation will happen before the * net device is removed. */ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev))); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); else atomic_inc((atomic_t *)&dtab->items); return 0; } static long dev_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dev, *old_dev; struct bpf_devmap_val val = {}; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; /* already verified value_size <= sizeof val */ memcpy(&val, value, map->value_size); if (unlikely(map_flags > BPF_EXIST || !val.ifindex)) return -EINVAL; spin_lock_irqsave(&dtab->index_lock, flags); old_dev = __dev_map_hash_lookup_elem(map, idx); if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; dev = __dev_map_alloc_node(net, dtab, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; } if (old_dev) { hlist_del_rcu(&old_dev->index_hlist); } else { if (dtab->items >= dtab->map.max_entries) { spin_unlock_irqrestore(&dtab->index_lock, flags); call_rcu(&dev->rcu, __dev_map_entry_free); return -E2BIG; } dtab->items++; } hlist_add_head_rcu(&dev->index_hlist, dev_map_index_hash(dtab, idx)); spin_unlock_irqrestore(&dtab->index_lock, flags); if (old_dev) call_rcu(&old_dev->rcu, __dev_map_entry_free); return 0; out_err: spin_unlock_irqrestore(&dtab->index_lock, flags); return err; } static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { return __dev_map_hash_update_elem(current->nsproxy->net_ns, map, key, value, map_flags); } static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_hash_lookup_elem); } static u64 dev_map_mem_usage(const struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); u64 usage = sizeof(struct bpf_dtab); if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) usage += (u64)dtab->n_buckets * sizeof(struct hlist_head); else usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *); usage += atomic_read((atomic_t *)&dtab->items) * (u64)sizeof(struct bpf_dtab_netdev); return usage; } BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab) const struct bpf_map_ops dev_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_get_next_key, .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_map_redirect, }; const struct bpf_map_ops dev_map_hash_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = dev_map_alloc, .map_free = dev_map_free, .map_get_next_key = dev_map_hash_get_next_key, .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, .map_check_btf = map_check_no_btf, .map_mem_usage = dev_map_mem_usage, .map_btf_id = &dev_map_btf_ids[0], .map_redirect = dev_hash_map_redirect, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, struct net_device *netdev) { unsigned long flags; u32 i; spin_lock_irqsave(&dtab->index_lock, flags); for (i = 0; i < dtab->n_buckets; i++) { struct bpf_dtab_netdev *dev; struct hlist_head *head; struct hlist_node *next; head = dev_map_index_hash(dtab, i); hlist_for_each_entry_safe(dev, next, head, index_hlist) { if (netdev != dev->dev) continue; dtab->items--; hlist_del_rcu(&dev->index_hlist); call_rcu(&dev->rcu, __dev_map_entry_free); } } spin_unlock_irqrestore(&dtab->index_lock, flags); } static int dev_map_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); struct bpf_dtab *dtab; int i, cpu; switch (event) { case NETDEV_REGISTER: if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq) break; /* will be freed in free_netdev() */ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; for_each_possible_cpu(cpu) per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev; break; case NETDEV_UNREGISTER: /* This rcu_read_lock/unlock pair is needed because * dev_map_list is an RCU list AND to ensure a delete * operation does not free a netdev_map entry while we * are comparing it against the netdev being unregistered. */ rcu_read_lock(); list_for_each_entry_rcu(dtab, &dev_map_list, list) { if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dev_map_hash_remove_netdev(dtab, netdev); continue; } for (i = 0; i < dtab->map.max_entries; i++) { struct bpf_dtab_netdev *dev, *odev; dev = rcu_dereference(dtab->netdev_map[i]); if (!dev || netdev != dev->dev) continue; odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL)); if (dev == odev) { call_rcu(&dev->rcu, __dev_map_entry_free); atomic_dec((atomic_t *)&dtab->items); } } } rcu_read_unlock(); break; default: break; } return NOTIFY_OK; } static struct notifier_block dev_map_notifier = { .notifier_call = dev_map_notification, }; static int __init dev_map_init(void) { int cpu; /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != offsetof(struct _bpf_dtab_netdev, dev)); register_netdevice_notifier(&dev_map_notifier); for_each_possible_cpu(cpu) INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu)); return 0; } subsys_initcall(dev_map_init);
62 447 1 62 112 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_LOCAL_H #define _ASM_X86_LOCAL_H #include <linux/percpu.h> #include <linux/atomic.h> #include <asm/asm.h> typedef struct { atomic_long_t a; } local_t; #define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) } #define local_read(l) atomic_long_read(&(l)->a) #define local_set(l, i) atomic_long_set(&(l)->a, (i)) static inline void local_inc(local_t *l) { asm volatile(_ASM_INC "%0" : "+m" (l->a.counter)); } static inline void local_dec(local_t *l) { asm volatile(_ASM_DEC "%0" : "+m" (l->a.counter)); } static inline void local_add(long i, local_t *l) { asm volatile(_ASM_ADD "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } static inline void local_sub(long i, local_t *l) { asm volatile(_ASM_SUB "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } /** * local_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @l: pointer to type local_t * * Atomically subtracts @i from @l and returns * true if the result is zero, or false for all * other cases. */ static inline bool local_sub_and_test(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i); } /** * local_dec_and_test - decrement and test * @l: pointer to type local_t * * Atomically decrements @l by 1 and * returns true if the result is 0, or false for all other * cases. */ static inline bool local_dec_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e); } /** * local_inc_and_test - increment and test * @l: pointer to type local_t * * Atomically increments @l by 1 * and returns true if the result is zero, or false for all * other cases. */ static inline bool local_inc_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e); } /** * local_add_negative - add and test if negative * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns true * if the result is negative, or false when * result is greater than or equal to zero. */ static inline bool local_add_negative(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i); } /** * local_add_return - add and return * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns @i + @l */ static inline long local_add_return(long i, local_t *l) { long __i = i; asm volatile(_ASM_XADD "%0, %1;" : "+r" (i), "+m" (l->a.counter) : : "memory"); return i + __i; } static inline long local_sub_return(long i, local_t *l) { return local_add_return(-i, l); } #define local_inc_return(l) (local_add_return(1, l)) #define local_dec_return(l) (local_sub_return(1, l)) static inline long local_cmpxchg(local_t *l, long old, long new) { return cmpxchg_local(&l->a.counter, old, new); } static inline bool local_try_cmpxchg(local_t *l, long *old, long new) { return try_cmpxchg_local(&l->a.counter, (typeof(l->a.counter) *) old, new); } /* Always has a lock prefix */ #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** * local_add_unless - add unless the number is a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * * Atomically adds @a to @l, so long as it was not @u. * Returns non-zero if @l was not @u, and zero otherwise. */ #define local_add_unless(l, a, u) \ ({ \ long c, old; \ c = local_read((l)); \ for (;;) { \ if (unlikely(c == (u))) \ break; \ old = local_cmpxchg((l), c, c + (a)); \ if (likely(old == c)) \ break; \ c = old; \ } \ c != (u); \ }) #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. * On x86-64 these are better than the atomic variants on SMP kernels * because they dont use a lock prefix. */ #define __local_inc(l) local_inc(l) #define __local_dec(l) local_dec(l) #define __local_add(i, l) local_add((i), (l)) #define __local_sub(i, l) local_sub((i), (l)) #endif /* _ASM_X86_LOCAL_H */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 /* * ppp_mppe.c - interface MPPE to the PPP code. * This version is for use with Linux kernel 2.6.14+ * * By Frank Cusack <fcusack@fcusack.com>. * Copyright (c) 2002,2003,2004 Google, Inc. * All rights reserved. * * License: * Permission to use, copy, modify, and distribute this software and its * documentation is hereby granted, provided that the above copyright * notice appears in all copies. This software is provided without any * warranty, express or implied. * * ALTERNATIVELY, provided that this notice is retained in full, this product * may be distributed under the terms of the GNU General Public License (GPL), * in which case the provisions of the GPL apply INSTEAD OF those given above. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. * * * Changelog: * 08/12/05 - Matt Domsch <Matt_Domsch@dell.com> * Only need extra skb padding on transmit, not receive. * 06/18/04 - Matt Domsch <Matt_Domsch@dell.com>, Oleg Makarenko <mole@quadra.ru> * Use Linux kernel 2.6 arc4 and sha1 routines rather than * providing our own. * 2/15/04 - TS: added #include <version.h> and testing for Kernel * version before using * MOD_DEC_USAGE_COUNT/MOD_INC_USAGE_COUNT which are * deprecated in 2.6 */ #include <crypto/arc4.h> #include <crypto/hash.h> #include <linux/err.h> #include <linux/fips.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/ppp_defs.h> #include <linux/ppp-comp.h> #include <linux/scatterlist.h> #include <asm/unaligned.h> #include "ppp_mppe.h" MODULE_AUTHOR("Frank Cusack <fcusack@fcusack.com>"); MODULE_DESCRIPTION("Point-to-Point Protocol Microsoft Point-to-Point Encryption support"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("ppp-compress-" __stringify(CI_MPPE)); MODULE_VERSION("1.0.2"); #define SHA1_PAD_SIZE 40 /* * kernel crypto API needs its arguments to be in kmalloc'd memory, not in the module * static data area. That means sha_pad needs to be kmalloc'd. */ struct sha_pad { unsigned char sha_pad1[SHA1_PAD_SIZE]; unsigned char sha_pad2[SHA1_PAD_SIZE]; }; static struct sha_pad *sha_pad; static inline void sha_pad_init(struct sha_pad *shapad) { memset(shapad->sha_pad1, 0x00, sizeof(shapad->sha_pad1)); memset(shapad->sha_pad2, 0xF2, sizeof(shapad->sha_pad2)); } /* * State for an MPPE (de)compressor. */ struct ppp_mppe_state { struct arc4_ctx arc4; struct shash_desc *sha1; unsigned char *sha1_digest; unsigned char master_key[MPPE_MAX_KEY_LEN]; unsigned char session_key[MPPE_MAX_KEY_LEN]; unsigned keylen; /* key length in bytes */ /* NB: 128-bit == 16, 40-bit == 8! */ /* If we want to support 56-bit, */ /* the unit has to change to bits */ unsigned char bits; /* MPPE control bits */ unsigned ccount; /* 12-bit coherency count (seqno) */ unsigned stateful; /* stateful mode flag */ int discard; /* stateful mode packet loss flag */ int sanity_errors; /* take down LCP if too many */ int unit; int debug; struct compstat stats; }; /* struct ppp_mppe_state.bits definitions */ #define MPPE_BIT_A 0x80 /* Encryption table were (re)inititalized */ #define MPPE_BIT_B 0x40 /* MPPC only (not implemented) */ #define MPPE_BIT_C 0x20 /* MPPC only (not implemented) */ #define MPPE_BIT_D 0x10 /* This is an encrypted frame */ #define MPPE_BIT_FLUSHED MPPE_BIT_A #define MPPE_BIT_ENCRYPTED MPPE_BIT_D #define MPPE_BITS(p) ((p)[4] & 0xf0) #define MPPE_CCOUNT(p) ((((p)[4] & 0x0f) << 8) + (p)[5]) #define MPPE_CCOUNT_SPACE 0x1000 /* The size of the ccount space */ #define MPPE_OVHD 2 /* MPPE overhead/packet */ #define SANITY_MAX 1600 /* Max bogon factor we will tolerate */ /* * Key Derivation, from RFC 3078, RFC 3079. * Equivalent to Get_Key() for MS-CHAP as described in RFC 3079. */ static void get_new_key_from_sha(struct ppp_mppe_state * state) { crypto_shash_init(state->sha1); crypto_shash_update(state->sha1, state->master_key, state->keylen); crypto_shash_update(state->sha1, sha_pad->sha_pad1, sizeof(sha_pad->sha_pad1)); crypto_shash_update(state->sha1, state->session_key, state->keylen); crypto_shash_update(state->sha1, sha_pad->sha_pad2, sizeof(sha_pad->sha_pad2)); crypto_shash_final(state->sha1, state->sha1_digest); } /* * Perform the MPPE rekey algorithm, from RFC 3078, sec. 7.3. * Well, not what's written there, but rather what they meant. */ static void mppe_rekey(struct ppp_mppe_state * state, int initial_key) { get_new_key_from_sha(state); if (!initial_key) { arc4_setkey(&state->arc4, state->sha1_digest, state->keylen); arc4_crypt(&state->arc4, state->session_key, state->sha1_digest, state->keylen); } else { memcpy(state->session_key, state->sha1_digest, state->keylen); } if (state->keylen == 8) { /* See RFC 3078 */ state->session_key[0] = 0xd1; state->session_key[1] = 0x26; state->session_key[2] = 0x9e; } arc4_setkey(&state->arc4, state->session_key, state->keylen); } /* * Allocate space for a (de)compressor. */ static void *mppe_alloc(unsigned char *options, int optlen) { struct ppp_mppe_state *state; struct crypto_shash *shash; unsigned int digestsize; if (optlen != CILEN_MPPE + sizeof(state->master_key) || options[0] != CI_MPPE || options[1] != CILEN_MPPE || fips_enabled) goto out; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) goto out; shash = crypto_alloc_shash("sha1", 0, 0); if (IS_ERR(shash)) goto out_free; state->sha1 = kmalloc(sizeof(*state->sha1) + crypto_shash_descsize(shash), GFP_KERNEL); if (!state->sha1) { crypto_free_shash(shash); goto out_free; } state->sha1->tfm = shash; digestsize = crypto_shash_digestsize(shash); if (digestsize < MPPE_MAX_KEY_LEN) goto out_free; state->sha1_digest = kmalloc(digestsize, GFP_KERNEL); if (!state->sha1_digest) goto out_free; /* Save keys. */ memcpy(state->master_key, &options[CILEN_MPPE], sizeof(state->master_key)); memcpy(state->session_key, state->master_key, sizeof(state->master_key)); /* * We defer initial key generation until mppe_init(), as mppe_alloc() * is called frequently during negotiation. */ return (void *)state; out_free: kfree(state->sha1_digest); if (state->sha1) { crypto_free_shash(state->sha1->tfm); kfree_sensitive(state->sha1); } kfree(state); out: return NULL; } /* * Deallocate space for a (de)compressor. */ static void mppe_free(void *arg) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; if (state) { kfree(state->sha1_digest); crypto_free_shash(state->sha1->tfm); kfree_sensitive(state->sha1); kfree_sensitive(state); } } /* * Initialize (de)compressor state. */ static int mppe_init(void *arg, unsigned char *options, int optlen, int unit, int debug, const char *debugstr) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; unsigned char mppe_opts; if (optlen != CILEN_MPPE || options[0] != CI_MPPE || options[1] != CILEN_MPPE) return 0; MPPE_CI_TO_OPTS(&options[2], mppe_opts); if (mppe_opts & MPPE_OPT_128) state->keylen = 16; else if (mppe_opts & MPPE_OPT_40) state->keylen = 8; else { printk(KERN_WARNING "%s[%d]: unknown key length\n", debugstr, unit); return 0; } if (mppe_opts & MPPE_OPT_STATEFUL) state->stateful = 1; /* Generate the initial session key. */ mppe_rekey(state, 1); if (debug) { printk(KERN_DEBUG "%s[%d]: initialized with %d-bit %s mode\n", debugstr, unit, (state->keylen == 16) ? 128 : 40, (state->stateful) ? "stateful" : "stateless"); printk(KERN_DEBUG "%s[%d]: keys: master: %*phN initial session: %*phN\n", debugstr, unit, (int)sizeof(state->master_key), state->master_key, (int)sizeof(state->session_key), state->session_key); } /* * Initialize the coherency count. The initial value is not specified * in RFC 3078, but we can make a reasonable assumption that it will * start at 0. Setting it to the max here makes the comp/decomp code * do the right thing (determined through experiment). */ state->ccount = MPPE_CCOUNT_SPACE - 1; /* * Note that even though we have initialized the key table, we don't * set the FLUSHED bit. This is contrary to RFC 3078, sec. 3.1. */ state->bits = MPPE_BIT_ENCRYPTED; state->unit = unit; state->debug = debug; return 1; } static int mppe_comp_init(void *arg, unsigned char *options, int optlen, int unit, int hdrlen, int debug) { /* ARGSUSED */ return mppe_init(arg, options, optlen, unit, debug, "mppe_comp_init"); } /* * We received a CCP Reset-Request (actually, we are sending a Reset-Ack), * tell the compressor to rekey. Note that we MUST NOT rekey for * every CCP Reset-Request; we only rekey on the next xmit packet. * We might get multiple CCP Reset-Requests if our CCP Reset-Ack is lost. * So, rekeying for every CCP Reset-Request is broken as the peer will not * know how many times we've rekeyed. (If we rekey and THEN get another * CCP Reset-Request, we must rekey again.) */ static void mppe_comp_reset(void *arg) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; state->bits |= MPPE_BIT_FLUSHED; } /* * Compress (encrypt) a packet. * It's strange to call this a compressor, since the output is always * MPPE_OVHD + 2 bytes larger than the input. */ static int mppe_compress(void *arg, unsigned char *ibuf, unsigned char *obuf, int isize, int osize) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; int proto; /* * Check that the protocol is in the range we handle. */ proto = PPP_PROTOCOL(ibuf); if (proto < 0x0021 || proto > 0x00fa) return 0; /* Make sure we have enough room to generate an encrypted packet. */ if (osize < isize + MPPE_OVHD + 2) { /* Drop the packet if we should encrypt it, but can't. */ printk(KERN_DEBUG "mppe_compress[%d]: osize too small! " "(have: %d need: %d)\n", state->unit, osize, osize + MPPE_OVHD + 2); return -1; } osize = isize + MPPE_OVHD + 2; /* * Copy over the PPP header and set control bits. */ obuf[0] = PPP_ADDRESS(ibuf); obuf[1] = PPP_CONTROL(ibuf); put_unaligned_be16(PPP_COMP, obuf + 2); obuf += PPP_HDRLEN; state->ccount = (state->ccount + 1) % MPPE_CCOUNT_SPACE; if (state->debug >= 7) printk(KERN_DEBUG "mppe_compress[%d]: ccount %d\n", state->unit, state->ccount); put_unaligned_be16(state->ccount, obuf); if (!state->stateful || /* stateless mode */ ((state->ccount & 0xff) == 0xff) || /* "flag" packet */ (state->bits & MPPE_BIT_FLUSHED)) { /* CCP Reset-Request */ /* We must rekey */ if (state->debug && state->stateful) printk(KERN_DEBUG "mppe_compress[%d]: rekeying\n", state->unit); mppe_rekey(state, 0); state->bits |= MPPE_BIT_FLUSHED; } obuf[0] |= state->bits; state->bits &= ~MPPE_BIT_FLUSHED; /* reset for next xmit */ obuf += MPPE_OVHD; ibuf += 2; /* skip to proto field */ isize -= 2; arc4_crypt(&state->arc4, obuf, ibuf, isize); state->stats.unc_bytes += isize; state->stats.unc_packets++; state->stats.comp_bytes += osize; state->stats.comp_packets++; return osize; } /* * Since every frame grows by MPPE_OVHD + 2 bytes, this is always going * to look bad ... and the longer the link is up the worse it will get. */ static void mppe_comp_stats(void *arg, struct compstat *stats) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; *stats = state->stats; } static int mppe_decomp_init(void *arg, unsigned char *options, int optlen, int unit, int hdrlen, int mru, int debug) { /* ARGSUSED */ return mppe_init(arg, options, optlen, unit, debug, "mppe_decomp_init"); } /* * We received a CCP Reset-Ack. Just ignore it. */ static void mppe_decomp_reset(void *arg) { /* ARGSUSED */ return; } /* * Decompress (decrypt) an MPPE packet. */ static int mppe_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, int osize) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; unsigned ccount; int flushed = MPPE_BITS(ibuf) & MPPE_BIT_FLUSHED; if (isize <= PPP_HDRLEN + MPPE_OVHD) { if (state->debug) printk(KERN_DEBUG "mppe_decompress[%d]: short pkt (%d)\n", state->unit, isize); return DECOMP_ERROR; } /* * Make sure we have enough room to decrypt the packet. * Note that for our test we only subtract 1 byte whereas in * mppe_compress() we added 2 bytes (+MPPE_OVHD); * this is to account for possible PFC. */ if (osize < isize - MPPE_OVHD - 1) { printk(KERN_DEBUG "mppe_decompress[%d]: osize too small! " "(have: %d need: %d)\n", state->unit, osize, isize - MPPE_OVHD - 1); return DECOMP_ERROR; } osize = isize - MPPE_OVHD - 2; /* assume no PFC */ ccount = MPPE_CCOUNT(ibuf); if (state->debug >= 7) printk(KERN_DEBUG "mppe_decompress[%d]: ccount %d\n", state->unit, ccount); /* sanity checks -- terminate with extreme prejudice */ if (!(MPPE_BITS(ibuf) & MPPE_BIT_ENCRYPTED)) { printk(KERN_DEBUG "mppe_decompress[%d]: ENCRYPTED bit not set!\n", state->unit); state->sanity_errors += 100; goto sanity_error; } if (!state->stateful && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set in " "stateless mode!\n", state->unit); state->sanity_errors += 100; goto sanity_error; } if (state->stateful && ((ccount & 0xff) == 0xff) && !flushed) { printk(KERN_DEBUG "mppe_decompress[%d]: FLUSHED bit not set on " "flag packet!\n", state->unit); state->sanity_errors += 100; goto sanity_error; } /* * Check the coherency count. */ if (!state->stateful) { /* Discard late packet */ if ((ccount - state->ccount) % MPPE_CCOUNT_SPACE > MPPE_CCOUNT_SPACE / 2) { state->sanity_errors++; goto sanity_error; } /* RFC 3078, sec 8.1. Rekey for every packet. */ while (state->ccount != ccount) { mppe_rekey(state, 0); state->ccount = (state->ccount + 1) % MPPE_CCOUNT_SPACE; } } else { /* RFC 3078, sec 8.2. */ if (!state->discard) { /* normal state */ state->ccount = (state->ccount + 1) % MPPE_CCOUNT_SPACE; if (ccount != state->ccount) { /* * (ccount > state->ccount) * Packet loss detected, enter the discard state. * Signal the peer to rekey (by sending a CCP Reset-Request). */ state->discard = 1; return DECOMP_ERROR; } } else { /* discard state */ if (!flushed) { /* ccp.c will be silent (no additional CCP Reset-Requests). */ return DECOMP_ERROR; } else { /* Rekey for every missed "flag" packet. */ while ((ccount & ~0xff) != (state->ccount & ~0xff)) { mppe_rekey(state, 0); state->ccount = (state->ccount + 256) % MPPE_CCOUNT_SPACE; } /* reset */ state->discard = 0; state->ccount = ccount; /* * Another problem with RFC 3078 here. It implies that the * peer need not send a Reset-Ack packet. But RFC 1962 * requires it. Hopefully, M$ does send a Reset-Ack; even * though it isn't required for MPPE synchronization, it is * required to reset CCP state. */ } } if (flushed) mppe_rekey(state, 0); } /* * Fill in the first part of the PPP header. The protocol field * comes from the decrypted data. */ obuf[0] = PPP_ADDRESS(ibuf); /* +1 */ obuf[1] = PPP_CONTROL(ibuf); /* +1 */ obuf += 2; ibuf += PPP_HDRLEN + MPPE_OVHD; isize -= PPP_HDRLEN + MPPE_OVHD; /* -6 */ /* net osize: isize-4 */ /* * Decrypt the first byte in order to check if it is * a compressed or uncompressed protocol field. */ arc4_crypt(&state->arc4, obuf, ibuf, 1); /* * Do PFC decompression. * This would be nicer if we were given the actual sk_buff * instead of a char *. */ if ((obuf[0] & 0x01) != 0) { obuf[1] = obuf[0]; obuf[0] = 0; obuf++; osize++; } /* And finally, decrypt the rest of the packet. */ arc4_crypt(&state->arc4, obuf + 1, ibuf + 1, isize - 1); state->stats.unc_bytes += osize; state->stats.unc_packets++; state->stats.comp_bytes += isize; state->stats.comp_packets++; /* good packet credit */ state->sanity_errors >>= 1; return osize; sanity_error: if (state->sanity_errors < SANITY_MAX) return DECOMP_ERROR; else /* Take LCP down if the peer is sending too many bogons. * We don't want to do this for a single or just a few * instances since it could just be due to packet corruption. */ return DECOMP_FATALERROR; } /* * Incompressible data has arrived (this should never happen!). * We should probably drop the link if the protocol is in the range * of what should be encrypted. At the least, we should drop this * packet. (How to do this?) */ static void mppe_incomp(void *arg, unsigned char *ibuf, int icnt) { struct ppp_mppe_state *state = (struct ppp_mppe_state *) arg; if (state->debug && (PPP_PROTOCOL(ibuf) >= 0x0021 && PPP_PROTOCOL(ibuf) <= 0x00fa)) printk(KERN_DEBUG "mppe_incomp[%d]: incompressible (unencrypted) data! " "(proto %04x)\n", state->unit, PPP_PROTOCOL(ibuf)); state->stats.inc_bytes += icnt; state->stats.inc_packets++; state->stats.unc_bytes += icnt; state->stats.unc_packets++; } /************************************************************* * Module interface table *************************************************************/ /* * Procedures exported to if_ppp.c. */ static struct compressor ppp_mppe = { .compress_proto = CI_MPPE, .comp_alloc = mppe_alloc, .comp_free = mppe_free, .comp_init = mppe_comp_init, .comp_reset = mppe_comp_reset, .compress = mppe_compress, .comp_stat = mppe_comp_stats, .decomp_alloc = mppe_alloc, .decomp_free = mppe_free, .decomp_init = mppe_decomp_init, .decomp_reset = mppe_decomp_reset, .decompress = mppe_decompress, .incomp = mppe_incomp, .decomp_stat = mppe_comp_stats, .owner = THIS_MODULE, .comp_extra = MPPE_PAD, }; /* * ppp_mppe_init() * * Prior to allowing load, try to load the arc4 and sha1 crypto * libraries. The actual use will be allocated later, but * this way the module will fail to insmod if they aren't available. */ static int __init ppp_mppe_init(void) { int answer; if (fips_enabled || !crypto_has_ahash("sha1", 0, CRYPTO_ALG_ASYNC)) return -ENODEV; sha_pad = kmalloc(sizeof(struct sha_pad), GFP_KERNEL); if (!sha_pad) return -ENOMEM; sha_pad_init(sha_pad); answer = ppp_register_compressor(&ppp_mppe); if (answer == 0) printk(KERN_INFO "PPP MPPE Compression module registered\n"); else kfree(sha_pad); return answer; } static void __exit ppp_mppe_cleanup(void) { ppp_unregister_compressor(&ppp_mppe); kfree(sha_pad); } module_init(ppp_mppe_init); module_exit(ppp_mppe_cleanup);
3489 3491 3489 3491 3490 3490 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 // SPDX-License-Identifier: GPL-2.0-or-later /* * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/bitops.h> #include <linux/string.h> #include <asm/unaligned.h> #include <crypto/chacha.h> static void chacha_permute(u32 *x, int nrounds) { int i; /* whitelist the allowed round counts */ WARN_ON_ONCE(nrounds != 20 && nrounds != 12); for (i = 0; i < nrounds; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); } } /** * chacha_block_generic - generate one keystream block and increment block counter * @state: input state matrix (16 32-bit words) * @stream: output keystream block (64 bytes) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function * also handles incrementing the block counter in the input matrix. */ void chacha_block_generic(u32 *state, u8 *stream, int nrounds) { u32 x[16]; int i; memcpy(x, state, 64); chacha_permute(x, nrounds); for (i = 0; i < ARRAY_SIZE(x); i++) put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); state[12]++; } EXPORT_SYMBOL(chacha_block_generic); /** * hchacha_block_generic - abbreviated ChaCha core, for XChaCha * @state: input state matrix (16 32-bit words) * @stream: output (8 32-bit words) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha * skips the final addition of the initial state, and outputs only certain words * of the state. It should not be used for streaming directly. */ void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds) { u32 x[16]; memcpy(x, state, 64); chacha_permute(x, nrounds); memcpy(&stream[0], &x[0], 16); memcpy(&stream[4], &x[12], 16); } EXPORT_SYMBOL(hchacha_block_generic);
7 5 5 5 1 5 1 5 5 5 5 5 5 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic INET6 transport hashtables * * Authors: Lotsa people, from code originally in tcp, generalised here * by Arnaldo Carvalho de Melo <acme@mandriva.com> */ #include <linux/module.h> #include <linux/random.h> #include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> #include <net/sock_reuseport.h> u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) { static u32 inet6_ehash_secret __read_mostly; static u32 ipv6_hash_secret __read_mostly; u32 lhash, fhash; net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret)); net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret)); lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret); return __inet6_ehashfn(lhash, lport, fhash, fport, inet6_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet6_ehashfn); /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, const int sdif) { struct sock *sk; const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) continue; if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) goto out; if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) { sock_gen_put(sk); goto begin; } goto found; } if (get_nulls_value(node) != slot) goto begin; out: sk = NULL; found: return sk; } EXPORT_SYMBOL(__inet6_lookup_established); static inline int compute_score(struct sock *sk, struct net *net, const unsigned short hnum, const struct in6_addr *daddr, const int dif, const int sdif) { int score = -1; if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum && sk->sk_family == PF_INET6) { if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return -1; if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } return score; } /** * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary. * @net: network namespace. * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP. * @skb: context for a potential SK_REUSEPORT program. * @doff: header offset. * @saddr: source address. * @sport: source port. * @daddr: destination address. * @hnum: destination port in host byte order. * @ehashfn: hash function used to generate the fallback hash. * * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to * the selected sock or an error. */ struct sock *inet6_lookup_reuseport(struct net *net, struct sock *sk, struct sk_buff *skb, int doff, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned short hnum, inet6_ehashfn_t *ehashfn) { struct sock *reuse_sk = NULL; u32 phash; if (sk->sk_reuseport) { phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn, net, daddr, hnum, saddr, sport); reuse_sk = reuseport_select_sock(sk, phash, skb, doff); } return reuse_sk; } EXPORT_SYMBOL_GPL(inet6_lookup_reuseport); /* called with rcu_read_lock() */ static struct sock *inet6_lhash2_lookup(struct net *net, struct inet_listen_hashbucket *ilb2, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct sock *sk, *result = NULL; struct hlist_nulls_node *node; int score, hiscore = 0; sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, inet6_ehashfn); if (result) return result; result = sk; hiscore = score; } } return result; } struct sock *inet6_lookup_run_sk_lookup(struct net *net, int protocol, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 hnum, const int dif, inet6_ehashfn_t *ehashfn) { struct sock *sk, *reuse_sk; bool no_reuseport; no_reuseport = bpf_sk_lookup_run_v6(net, protocol, saddr, sport, daddr, hnum, dif, &sk); if (no_reuseport || IS_ERR_OR_NULL(sk)) return sk; reuse_sk = inet6_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum, ehashfn); if (reuse_sk) sk = reuse_sk; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && hashinfo == net->ipv4.tcp_death_row.hashinfo) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); if (result) goto done; } hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, dif, sdif); if (result) goto done; /* Lookup lhash2 with in6addr_any */ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); result = inet6_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, &in6addr_any, hnum, dif, sdif); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif) { struct sock *sk; bool refcounted; sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, struct inet_timewait_sock **twp) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; const struct in6_addr *saddr = &sk->sk_v6_daddr; const int dif = sk->sk_bound_dev_if; struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { if (sk2->sk_hash != hash) continue; if (likely(inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); if (twsk_unique(sk, sk2, twp)) break; } goto not_unique; } } /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ inet->inet_num = lport; inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule_put(tw); } return 0; not_unique: spin_unlock(lock); return -EADDRNOTAVAIL; } static u64 inet6_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); return secure_ipv6_port_ephemeral(sk->sk_v6_rcv_saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_dport); } int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { u64 port_offset = 0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); return __inet_hash_connect(death_row, sk, port_offset, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); int inet6_hash(struct sock *sk) { int err = 0; if (sk->sk_state != TCP_CLOSE) err = __inet_hash(sk, NULL); return err; } EXPORT_SYMBOL_GPL(inet6_hash);
3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC key management * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * RxRPC keys should have a description of describing their purpose: * "afs@CAMBRIDGE.REDHAT.COM> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/key-type.h> #include <linux/ctype.h> #include <linux/slab.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> #include <keys/user-type.h> #include "ar-internal.h" static int rxrpc_vet_description_s(const char *); static int rxrpc_preparse_s(struct key_preparsed_payload *); static void rxrpc_free_preparse_s(struct key_preparsed_payload *); static void rxrpc_destroy_s(struct key *); static void rxrpc_describe_s(const struct key *, struct seq_file *); /* * rxrpc server keys take "<serviceId>:<securityIndex>[:<sec-specific>]" as the * description and the key material as the payload. */ struct key_type key_type_rxrpc_s = { .name = "rxrpc_s", .flags = KEY_TYPE_NET_DOMAIN, .vet_description = rxrpc_vet_description_s, .preparse = rxrpc_preparse_s, .free_preparse = rxrpc_free_preparse_s, .instantiate = generic_key_instantiate, .destroy = rxrpc_destroy_s, .describe = rxrpc_describe_s, }; /* * Vet the description for an RxRPC server key. */ static int rxrpc_vet_description_s(const char *desc) { unsigned long service, sec_class; char *p; service = simple_strtoul(desc, &p, 10); if (*p != ':' || service > 65535) return -EINVAL; sec_class = simple_strtoul(p + 1, &p, 10); if ((*p && *p != ':') || sec_class < 1 || sec_class > 255) return -EINVAL; return 0; } /* * Preparse a server secret key. */ static int rxrpc_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec; unsigned int service, sec_class; int n; _enter("%zu", prep->datalen); if (!prep->orig_description) return -EINVAL; if (sscanf(prep->orig_description, "%u:%u%n", &service, &sec_class, &n) != 2) return -EINVAL; sec = rxrpc_security_lookup(sec_class); if (!sec) return -ENOPKG; prep->payload.data[1] = (struct rxrpc_security *)sec; if (!sec->preparse_server_key) return -EINVAL; return sec->preparse_server_key(prep); } static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep) { const struct rxrpc_security *sec = prep->payload.data[1]; if (sec && sec->free_preparse_server_key) sec->free_preparse_server_key(prep); } static void rxrpc_destroy_s(struct key *key) { const struct rxrpc_security *sec = key->payload.data[1]; if (sec && sec->destroy_server_key) sec->destroy_server_key(key); } static void rxrpc_describe_s(const struct key *key, struct seq_file *m) { const struct rxrpc_security *sec = key->payload.data[1]; seq_puts(m, key->description); if (sec && sec->describe_server_key) sec->describe_server_key(key, m); } /* * grab the security keyring for a server socket */ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) { struct key *key; char *description; _enter(""); if (optlen <= 0 || optlen > PAGE_SIZE - 1) return -EINVAL; description = memdup_sockptr_nul(optval, optlen); if (IS_ERR(description)) return PTR_ERR(description); key = request_key(&key_type_keyring, description, NULL); if (IS_ERR(key)) { kfree(description); _leave(" = %ld", PTR_ERR(key)); return PTR_ERR(key); } rx->securities = key; kfree(description); _leave(" = 0 [key %x]", key->serial); return 0; } /** * rxrpc_sock_set_security_keyring - Set the security keyring for a kernel service * @sk: The socket to set the keyring on * @keyring: The keyring to set * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { struct rxrpc_sock *rx = rxrpc_sk(sk); int ret = 0; lock_sock(sk); if (rx->securities) ret = -EINVAL; else if (rx->sk.sk_state != RXRPC_UNBOUND) ret = -EISCONN; else rx->securities = key_get(keyring); release_sock(sk); return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring);
1 1 1 3 8 6 6 4 4 6 6 6 5 5 4 2 3 2 5 2 2 2 2 5 2 6 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 ethtool hooks for cfg80211 * * Copied from cfg.c - originally * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2014 Intel Corporation (Author: Johannes Berg) * Copyright (C) 2018, 2022 Intel Corporation */ #include <linux/types.h> #include <net/cfg80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" static int ieee80211_set_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); } static void ieee80211_get_ringparam(struct net_device *dev, struct ethtool_ringparam *rp, struct kernel_ethtool_ringparam *kernel_rp, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); memset(rp, 0, sizeof(*rp)); drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_packets", "rx_bytes", "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", "sta_state", "txrate", "rxrate", "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) static int ieee80211_get_sset_count(struct net_device *dev, int sset) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int rv = 0; if (sset == ETH_SS_STATS) rv += STA_STATS_LEN; rv += drv_get_et_sset_count(sdata, sset); if (rv == 0) return -EOPNOTSUPP; return rv; } static void ieee80211_get_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *channel; struct sta_info *sta; struct ieee80211_local *local = sdata->local; struct station_info sinfo; struct survey_info survey; int i, q; #define STA_STATS_SURVEY_LEN 7 memset(data, 0, sizeof(u64) * STA_STATS_LEN); #define ADD_STA_STATS(sta) \ do { \ data[i++] += sinfo.rx_packets; \ data[i++] += sinfo.rx_bytes; \ data[i++] += (sta)->rx_stats.num_duplicates; \ data[i++] += (sta)->rx_stats.fragments; \ data[i++] += sinfo.rx_dropped_misc; \ \ data[i++] += sinfo.tx_packets; \ data[i++] += sinfo.tx_bytes; \ data[i++] += (sta)->status_stats.filtered; \ data[i++] += sinfo.tx_failed; \ data[i++] += sinfo.tx_retries; \ } while (0) /* For Managed stations, find the single station based on BSSID * and use that. For interface types, iterate through all available * stations and add stats for any station that is assigned to this * network device. */ mutex_lock(&local->sta_mtx); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid); if (!(sta && !WARN_ON(sta->sdata->dev != dev))) goto do_survey; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); data[i++] = sta->sta_state; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.txrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) data[i] = 100000ULL * cfg80211_calculate_bitrate(&sinfo.rxrate); i++; if (sinfo.filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG)) data[i] = (u8)sinfo.signal_avg; i++; } else { list_for_each_entry(sta, &local->sta_list, list) { /* Make sure this station belongs to the proper dev */ if (sta->sdata->dev != dev) continue; memset(&sinfo, 0, sizeof(sinfo)); sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); } } do_survey: i = STA_STATS_LEN - STA_STATS_SURVEY_LEN; /* Get survey stats for current channel */ survey.filled = 0; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; else channel = NULL; rcu_read_unlock(); if (channel) { q = 0; do { survey.filled = 0; if (drv_get_survey(local, q, &survey) != 0) { survey.filled = 0; break; } q++; } while (channel != survey.channel); } if (survey.filled) data[i++] = survey.channel->center_freq; else data[i++] = 0; if (survey.filled & SURVEY_INFO_NOISE_DBM) data[i++] = (u8)survey.noise; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME) data[i++] = survey.time; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_BUSY) data[i++] = survey.time_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_EXT_BUSY) data[i++] = survey.time_ext_busy; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_RX) data[i++] = survey.time_rx; else data[i++] = -1LL; if (survey.filled & SURVEY_INFO_TIME_TX) data[i++] = survey.time_tx; else data[i++] = -1LL; mutex_unlock(&local->sta_mtx); if (WARN_ON(i != STA_STATS_LEN)) return; drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int sz_sta_stats = 0; if (sset == ETH_SS_STATS) { sz_sta_stats = sizeof(ieee80211_gstrings_sta_stats); memcpy(data, ieee80211_gstrings_sta_stats, sz_sta_stats); } drv_get_et_strings(sdata, sset, &(data[sz_sta_stats])); } static int ieee80211_get_regs_len(struct net_device *dev) { return 0; } static void ieee80211_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *data) { struct wireless_dev *wdev = dev->ieee80211_ptr; regs->version = wdev->wiphy->hw_version; regs->len = 0; } const struct ethtool_ops ieee80211_ethtool_ops = { .get_drvinfo = cfg80211_get_drvinfo, .get_regs_len = ieee80211_get_regs_len, .get_regs = ieee80211_get_regs, .get_link = ethtool_op_get_link, .get_ringparam = ieee80211_get_ringparam, .set_ringparam = ieee80211_set_ringparam, .get_strings = ieee80211_get_strings, .get_ethtool_stats = ieee80211_get_stats, .get_sset_count = ieee80211_get_sset_count, };
973 972 974 1040 973 580 580 397 397 974 974 974 1040 974 972 974 1039 1040 1040 1040 1040 1039 972 1040 1040 973 1040 1040 1040 1040 1385 1386 1383 58 1386 1205 1386 1386 1383 1270 1040 1039 1040 1040 1040 1040 304 362 362 1385 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int hash_by_src(const struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; /* Zone ID can be used provided its valid for both directions */ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) combined.zone = zone->id; hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, * so we invert the tuple and look for the incoming reply. * * We could keep a separate hash if this proves too slow. */ struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | IPS_DYING; static const unsigned long flags_needed = IPS_SRC_NAT; enum tcp_conntrack old_state; old_state = READ_ONCE(ct->proto.tcp.state); if (old_state < TCP_CONNTRACK_TIME_WAIT) return false; if (flags & flags_refuse) return false; return (flags & flags_needed) == flags_needed; } /* reverse direction will send packets to new source, so * make sure such packets are invalid. */ static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) { return (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0; } static int nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack, unsigned int attempts_left) { static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple reply; unsigned long flags; struct nf_conn *ct; bool taken = true; struct net *net; nf_ct_invert_tuple(&reply, tuple); if (attempts_left > NF_NAT_HARDER_THRESH || tuple->dst.protonum != IPPROTO_TCP || ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) return nf_conntrack_tuple_taken(&reply, ignored_conntrack); /* :ast few attempts to find a free tcp port. Destructive * action: evict colliding if its in timewait state and the * tcp sequence number has advanced past the one used by the * old entry. */ net = nf_ct_net(ignored_conntrack); zone = nf_ct_zone(ignored_conntrack); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) return false; ct = nf_ct_tuplehash_to_ctrack(thash); if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) goto out; if (WARN_ON_ONCE(ct == ignored_conntrack)) goto out; flags = READ_ONCE(ct->status); if (!nf_nat_may_kill(ct, flags)) goto out; if (!nf_seq_has_advanced(ct, ignored_conntrack)) goto out; /* Even if we can evict do not reuse if entry is offloaded. */ if (nf_ct_kill(ct)) taken = flags & flags_offload; out: nf_ct_put(ct); return taken; } static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { if (t->src.l3num == NFPROTO_IPV4) return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; } /* Is the manipable part of the tuple between min and max incl? */ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { __be16 port; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; else port = tuple->dst.u.all; return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: return true; } } /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && !nf_nat_inet_in_range(tuple, range)) return 0; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, &range->min_proto, &range->max_proto); } static inline int same_src(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_tuple *t; t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && t->src.u.all == tuple->src.u.all); } /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; if (nf_in_range(result, range)) return 1; } } return 0; } /* For [FUTURE] fragmentation handling, we want the least-used * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports * 1-65535, we don't do pro-rata allocation based on ports; we choose * the ip with the lowest src-ip/dst-ip/proto usage. */ static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { union nf_inet_addr *var_ipp; unsigned int i, max; /* Host order */ u32 minip, maxip, j, dist; bool full_range; /* No IP mapping? Do nothing. */ if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return; if (maniptype == NF_NAT_MANIP_SRC) var_ipp = &tuple->src.u3; else var_ipp = &tuple->dst.u3; /* Fast path: only one choice. */ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { *var_ipp = range->min_addr; return; } if (nf_ct_l3num(ct) == NFPROTO_IPV4) max = sizeof(var_ipp->ip) / sizeof(u32) - 1; else max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; /* Hashing source and destination IPs gives a fairly even * spread in practice (if there are a small number of IPs * involved, there usually aren't that many connections * anyway). The consistency means that servers see the same * client coming from the same IP (some Internet Banking sites * like this), even across reboots. */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { /* If first bytes of the address are at the maximum, use the * distance. Otherwise use the full range. */ if (!full_range) { minip = ntohl((__force __be32)range->min_addr.all[i]); maxip = ntohl((__force __be32)range->max_addr.all[i]); dist = maxip - minip + 1; } else { minip = 0; dist = ~0; } var_ipp->all[i] = (__force __u32) htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) j ^= (__force u32)tuple->dst.u3.all[i]; } } /* Alter the per-proto part of the tuple (depending on maniptype), to * give a unique tuple in the given range if possible. * * Per-protocol part of tuple is initialized to the incoming packet. */ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else off = get_random_u16(); attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. */ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V4_MAXIP]) range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); else range->max_addr.ip = range->min_addr.ip; return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { return -EOPNOTSUPP; } #endif static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, }; int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; unsigned int hooknum = ops->hooknum; struct nf_hook_ops *nat_ops; int i, ret; if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) return -EINVAL; mutex_lock(&nf_nat_proto_mutex); if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; } for (i = 0; i < ops_count; i++) { priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { nat_ops[i].priv = priv; continue; } mutex_unlock(&nf_nat_proto_mutex); while (i) kfree(nat_ops[--i].priv); kfree(nat_ops); return -ENOMEM; } ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); for (i = 0; i < ops_count; i++) kfree(nat_ops[i].priv); kfree(nat_ops); return ret; } nat_proto_net->nat_hook_ops = nat_ops; } nat_ops = nat_proto_net->nat_hook_ops; priv = nat_ops[hooknum].priv; if (WARN_ON_ONCE(!priv)) { mutex_unlock(&nf_nat_proto_mutex); return -EOPNOTSUPP; } ret = nf_hook_entries_insert_raw(&priv->entries, ops); if (ret == 0) nat_proto_net->users++; mutex_unlock(&nf_nat_proto_mutex); return ret; } void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; struct nf_hook_ops *nat_ops; int hooknum = ops->hooknum; int i; if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) goto unlock; nat_proto_net->users--; nat_ops = nat_proto_net->nat_hook_ops; for (i = 0; i < ops_count; i++) { if (nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) goto unlock; priv = nat_ops[hooknum].priv; nf_hook_entries_delete_raw(&priv->entries, ops); if (nat_proto_net->users == 0) { nf_unregister_net_hooks(net, nat_ops, ops_count); for (i = 0; i < ops_count; i++) { priv = nat_ops[i].priv; kfree_rcu(priv, rcu_head); } nat_proto_net->nat_hook_ops = NULL; kfree(nat_ops); } unlock: mutex_unlock(&nf_nat_proto_mutex); } static struct pernet_operations nat_net_ops = { .id = &nat_net_id, .size = sizeof(struct nat_net), }; static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif .manip_pkt = nf_nat_manip_pkt, .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) { int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; if (nf_nat_htable_size < CONNTRACK_LOCKS) nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) return -ENOMEM; for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { kvfree(nf_nat_bysource); return ret; } nf_ct_helper_expectfn_register(&follow_master_nat); WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); ret = register_nf_nat_bpf(); if (ret < 0) { RCU_INIT_POINTER(nf_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&follow_master_nat); synchronize_net(); unregister_pernet_subsys(&nat_net_ops); kvfree(nf_nat_bysource); } return ret; } static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); module_init(nf_nat_init); module_exit(nf_nat_cleanup);
2 2 2 2 2 2 2 1755 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN * * This implementation does not provide ISO-TP specific return values to the * userspace. * * - RX path timeout of data reception leads to -ETIMEDOUT * - RX path SN mismatch leads to -EILSEQ * - RX path data reception with wrong padding leads to -EBADMSG * - TX path flowcontrol reception timeout leads to -ECOMM * - TX path flowcontrol reception overflow leads to -EMSGSIZE * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG * - when a transfer (tx) is on the run the next write() blocks until it's done * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent * - as we have static buffers the check whether the PDU fits into the buffer * is done at FF reception time (no support for sending 'wait frames') * * Copyright (c) 2020 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/hrtimer.h> #include <linux/wait.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> MODULE_DESCRIPTION("PF_CAN isotp 15765-2:2016 protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>"); MODULE_ALIAS("can-proto-6"); #define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp) #define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) /* ISO 15765-2:2016 supports more than 4095 byte per ISO PDU as the FF_DL can * take full 32 bit values (4 Gbyte). We would need some good concept to handle * this between user space and kernel space. For now set the static buffer to * something about 8 kbyte to be able to test this new functionality. */ #define DEFAULT_MAX_PDU_SIZE 8300 /* maximum PDU size before ISO 15765-2:2016 extension was 4095 */ #define MAX_12BIT_PDU_SIZE 4095 /* limit the isotp pdu size from the optional module parameter to 1MByte */ #define MAX_PDU_SIZE (1025 * 1024U) static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE; module_param(max_pdu_size, uint, 0444); MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default " __stringify(DEFAULT_MAX_PDU_SIZE) ")"); /* N_PCI type values in bits 7-4 of N_PCI bytes */ #define N_PCI_SF 0x00 /* single frame */ #define N_PCI_FF 0x10 /* first frame */ #define N_PCI_CF 0x20 /* consecutive frame */ #define N_PCI_FC 0x30 /* flow control */ #define N_PCI_SZ 1 /* size of the PCI byte #1 */ #define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */ #define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */ #define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */ #define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */ #define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */ #define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA) #define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST) /* Flow Status given in FC frame */ #define ISOTP_FC_CTS 0 /* clear to send */ #define ISOTP_FC_WT 1 /* wait */ #define ISOTP_FC_OVFLW 2 /* overflow */ #define ISOTP_FC_TIMEOUT 1 /* 1 sec */ #define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */ enum { ISOTP_IDLE = 0, ISOTP_WAIT_FIRST_FC, ISOTP_WAIT_FC, ISOTP_WAIT_DATA, ISOTP_SENDING, ISOTP_SHUTDOWN, }; struct tpcon { u8 *buf; unsigned int buflen; unsigned int len; unsigned int idx; u32 state; u8 bs; u8 sn; u8 ll_dl; u8 sbuf[DEFAULT_MAX_PDU_SIZE]; }; struct isotp_sock { struct sock sk; int bound; int ifindex; canid_t txid; canid_t rxid; ktime_t tx_gap; ktime_t lastrxcf_tstamp; struct hrtimer rxtimer, txtimer, txfrtimer; struct can_isotp_options opt; struct can_isotp_fc_options rxfc, txfc; struct can_isotp_ll_options ll; u32 frame_txtime; u32 force_tx_stmin; u32 force_rx_stmin; u32 cfecho; /* consecutive frame echo tag */ struct tpcon rx, tx; struct list_head notifier; wait_queue_head_t wait; spinlock_t rx_lock; /* protect single thread state machine */ }; static LIST_HEAD(isotp_notifier_list); static DEFINE_SPINLOCK(isotp_notifier_lock); static struct isotp_sock *isotp_busy_notifier; static inline struct isotp_sock *isotp_sk(const struct sock *sk) { return (struct isotp_sock *)sk; } static u32 isotp_bc_flags(struct isotp_sock *so) { return so->opt.flags & ISOTP_ALL_BC_FLAGS; } static bool isotp_register_rxid(struct isotp_sock *so) { /* no broadcast modes => register rx_id for FC frame reception */ return (isotp_bc_flags(so) == 0); } static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, rxtimer); struct sock *sk = &so->sk; if (so->rx.state == ISOTP_WAIT_DATA) { /* we did not get new data frames in time */ /* report 'connection timed out' */ sk->sk_err = ETIMEDOUT; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; } return HRTIMER_NORESTART; } static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); if (!nskb) return 1; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } can_skb_reserve(nskb); can_skb_prv(nskb)->ifindex = dev->ifindex; can_skb_prv(nskb)->skbcnt = 0; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; skb_put_zero(nskb, so->ll.mtu); /* create & send flow control reply */ ncf->can_id = so->txid; if (so->opt.flags & CAN_ISOTP_TX_PADDING) { memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN); ncf->len = CAN_MAX_DLEN; } else { ncf->len = ae + FC_CONTENT_SZ; } ncf->data[ae] = N_PCI_FC | flowstatus; ncf->data[ae + 1] = so->rxfc.bs; ncf->data[ae + 2] = so->rxfc.stmin; if (ae) ncf->data[0] = so->opt.ext_address; ncf->flags = so->ll.tx_flags; can_send_ret = can_send(nskb, 1); if (can_send_ret) pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); dev_put(dev); /* reset blocksize counter */ so->rx.bs = 0; /* reset last CF frame rx timestamp for rx stmin enforcement */ so->lastrxcf_tstamp = ktime_set(0, 0); /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk) { struct sockaddr_can *addr = (struct sockaddr_can *)skb->cb; BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can)); memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } static u8 padlen(u8 datalen) { static const u8 plen[] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, /* 0 - 8 */ 12, 12, 12, 12, /* 9 - 12 */ 16, 16, 16, 16, /* 13 - 16 */ 20, 20, 20, 20, /* 17 - 20 */ 24, 24, 24, 24, /* 21 - 24 */ 32, 32, 32, 32, 32, 32, 32, 32, /* 25 - 32 */ 48, 48, 48, 48, 48, 48, 48, 48, /* 33 - 40 */ 48, 48, 48, 48, 48, 48, 48, 48 /* 41 - 48 */ }; if (datalen > 48) return 64; return plen[datalen]; } /* check for length optimization and return 1/true when the check fails */ static int check_optimized(struct canfd_frame *cf, int start_index) { /* for CAN_DL <= 8 the start_index is equal to the CAN_DL as the * padding would start at this point. E.g. if the padding would * start at cf.data[7] cf->len has to be 7 to be optimal. * Note: The data[] index starts with zero. */ if (cf->len <= CAN_MAX_DLEN) return (cf->len != start_index); /* This relation is also valid in the non-linear DLC range, where * we need to take care of the minimal next possible CAN_DL. * The correct check would be (padlen(cf->len) != padlen(start_index)). * But as cf->len can only take discrete values from 12, .., 64 at this * point the padlen(cf->len) is always equal to cf->len. */ return (cf->len != padlen(start_index)); } /* check padding and return 1/true when the check fails */ static int check_pad(struct isotp_sock *so, struct canfd_frame *cf, int start_index, u8 content) { int i; /* no RX_PADDING value => check length of optimized frame length */ if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) { if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) return check_optimized(cf, start_index); /* no valid test against empty value => ignore frame */ return 1; } /* check datalength of correctly padded CAN frame */ if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) && cf->len != padlen(cf->len)) return 1; /* check padding content */ if (so->opt.flags & CAN_ISOTP_CHK_PAD_DATA) { for (i = start_index; i < cf->len; i++) if (cf->data[i] != content) return 1; } return 0; } static void isotp_send_cframe(struct isotp_sock *so); static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) { struct sock *sk = &so->sk; if (so->tx.state != ISOTP_WAIT_FC && so->tx.state != ISOTP_WAIT_FIRST_FC) return 0; hrtimer_cancel(&so->txtimer); if ((cf->len < ae + FC_CONTENT_SZ) || ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return 1; } /* get communication parameters only from the first FC frame */ if (so->tx.state == ISOTP_WAIT_FIRST_FC) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; /* fix wrong STmin values according spec */ if (so->txfc.stmin > 0x7F && (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9)) so->txfc.stmin = 0x7F; so->tx_gap = ktime_set(0, 0); /* add transmission time for CAN frame N_As */ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime); /* add waiting time for consecutive frames N_Cs */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_add_ns(so->tx_gap, so->force_tx_stmin); else if (so->txfc.stmin < 0x80) so->tx_gap = ktime_add_ns(so->tx_gap, so->txfc.stmin * 1000000); else so->tx_gap = ktime_add_ns(so->tx_gap, (so->txfc.stmin - 0xF0) * 100000); so->tx.state = ISOTP_WAIT_FC; } switch (cf->data[ae] & 0x0F) { case ISOTP_FC_CTS: so->tx.bs = 0; so->tx.state = ISOTP_SENDING; /* send CF frame and enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); break; case ISOTP_FC_WT: /* start timer to wait for next FC frame */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); break; case ISOTP_FC_OVFLW: /* overflow on receiver side - report 'message too long' */ sk->sk_err = EMSGSIZE; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); fallthrough; default: /* stop this tx job */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); } return 0; } static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen, struct sk_buff *skb, int len) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; if (!len || len > cf->len - pcilen) return 1; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, len), &cf->data[pcilen], len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae) { struct isotp_sock *so = isotp_sk(sk); int i; int off; int ff_pci_sz; hrtimer_cancel(&so->rxtimer); so->rx.state = ISOTP_IDLE; /* get the used sender LL_DL from the (first) CAN frame data length */ so->rx.ll_dl = padlen(cf->len); /* the first frame has to use the entire frame up to LL_DL length */ if (cf->len != so->rx.ll_dl) return 1; /* get the FF_DL */ so->rx.len = (cf->data[ae] & 0x0F) << 8; so->rx.len += cf->data[ae + 1]; /* Check for FF_DL escape sequence supporting 32 bit PDU length */ if (so->rx.len) { ff_pci_sz = FF_PCI_SZ12; } else { /* FF_DL = 0 => get real length from next 4 bytes */ so->rx.len = cf->data[ae + 2] << 24; so->rx.len += cf->data[ae + 3] << 16; so->rx.len += cf->data[ae + 4] << 8; so->rx.len += cf->data[ae + 5]; ff_pci_sz = FF_PCI_SZ32; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl) return 1; /* PDU size > default => try max_pdu_size */ if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC); if (newbuf) { so->rx.buf = newbuf; so->rx.buflen = max_pdu_size; } } if (so->rx.len > so->rx.buflen) { /* send FC frame with overflow status */ isotp_send_fc(sk, ae, ISOTP_FC_OVFLW); return 1; } /* copy the first received data bytes */ so->rx.idx = 0; for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++) so->rx.buf[so->rx.idx++] = cf->data[i]; /* initial setup for this pdu reception */ so->rx.sn = 1; so->rx.state = ISOTP_WAIT_DATA; /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* send our first FC frame */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae, struct sk_buff *skb) { struct isotp_sock *so = isotp_sk(sk); struct sk_buff *nskb; int i; if (so->rx.state != ISOTP_WAIT_DATA) return 0; /* drop if timestamp gap is less than force_rx_stmin nano secs */ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) { if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) < so->force_rx_stmin) return 0; so->lastrxcf_tstamp = skb->tstamp; } hrtimer_cancel(&so->rxtimer); /* CFs are never longer than the FF */ if (cf->len > so->rx.ll_dl) return 1; /* CFs have usually the LL_DL length */ if (cf->len < so->rx.ll_dl) { /* this is only allowed for the last CF */ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ) return 1; } if ((cf->data[ae] & 0x0F) != so->rx.sn) { /* wrong sn detected - report 'illegal byte sequence' */ sk->sk_err = EILSEQ; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset rx state */ so->rx.state = ISOTP_IDLE; return 1; } so->rx.sn++; so->rx.sn %= 16; for (i = ae + N_PCI_SZ; i < cf->len; i++) { so->rx.buf[so->rx.idx++] = cf->data[i]; if (so->rx.idx >= so->rx.len) break; } if (so->rx.idx >= so->rx.len) { /* we are done */ so->rx.state = ISOTP_IDLE; if ((so->opt.flags & ISOTP_CHECK_PADDING) && check_pad(so, cf, i + 1, so->opt.rxpad_content)) { /* malformed PDU - report 'not a data message' */ sk->sk_err = EBADMSG; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); return 1; } nskb = alloc_skb(so->rx.len, gfp_any()); if (!nskb) return 1; memcpy(skb_put(nskb, so->rx.len), so->rx.buf, so->rx.len); nskb->tstamp = skb->tstamp; nskb->dev = skb->dev; isotp_rcv_skb(nskb, sk); return 0; } /* perform blocksize handling, if enabled */ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) { /* start rx timeout watchdog */ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return 0; } /* no creation of flow control frames */ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE) return 0; /* we reached the specified blocksize so->rxfc.bs */ isotp_send_fc(sk, ae, ISOTP_FC_CTS); return 0; } static void isotp_rcv(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; u8 n_pci_type, sf_dl; /* Strictly receive only frames with the configured MTU size * => clear separation of CAN2.0 / CAN FD transport channels */ if (skb->len != so->ll.mtu) return; cf = (struct canfd_frame *)skb->data; /* if enabled: check reception of my configured extended address */ if (ae && cf->data[0] != so->opt.rx_ext_address) return; n_pci_type = cf->data[ae] & 0xF0; /* Make sure the state changes and data structures stay consistent at * CAN frame reception time. This locking is not needed in real world * use cases but the inconsistency can be triggered with syzkaller. */ spin_lock(&so->rx_lock); if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) { /* check rx/tx path half duplex expectations */ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) || (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC)) goto out_unlock; } switch (n_pci_type) { case N_PCI_FC: /* tx path: flow control frame containing the FC parameters */ isotp_rcv_fc(so, cf, ae); break; case N_PCI_SF: /* rx path: single frame * * As we do not have a rx.ll_dl configuration, we can only test * if the CAN frames payload length matches the LL_DL == 8 * requirements - no matter if it's CAN 2.0 or CAN FD */ /* get the SF_DL from the N_PCI byte */ sf_dl = cf->data[ae] & 0x0F; if (cf->len <= CAN_MAX_DLEN) { isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); } else { if (can_is_canfd_skb(skb)) { /* We have a CAN FD frame and CAN_DL is greater than 8: * Only frames with the SF_DL == 0 ESC value are valid. * * If so take care of the increased SF PCI size * (SF_PCI_SZ8) to point to the message content behind * the extended SF PCI info and get the real SF_DL * length value from the formerly first data byte. */ if (sf_dl == 0) isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb, cf->data[SF_PCI_SZ4 + ae]); } } break; case N_PCI_FF: /* rx path: first frame */ isotp_rcv_ff(sk, cf, ae); break; case N_PCI_CF: /* rx path: consecutive frame */ isotp_rcv_cf(sk, cf, ae, skb); break; } out_unlock: spin_unlock(&so->rx_lock); } static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so, int ae, int off) { int pcilen = N_PCI_SZ + ae + off; int space = so->tx.ll_dl - pcilen; int num = min_t(int, so->tx.len - so->tx.idx, space); int i; cf->can_id = so->txid; cf->len = num + pcilen; if (num < space) { if (so->opt.flags & CAN_ISOTP_TX_PADDING) { /* user requested padding */ cf->len = padlen(cf->len); memset(cf->data, so->opt.txpad_content, cf->len); } else if (cf->len > CAN_MAX_DLEN) { /* mandatory padding for CAN FD frames */ cf->len = padlen(cf->len); memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT, cf->len); } } for (i = 0; i < num; i++) cf->data[pcilen + i] = so->tx.buf[so->tx.idx++]; if (ae) cf->data[0] = so->opt.ext_address; } static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) return; skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); if (!skb) { dev_put(dev); return; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* create consecutive frame */ isotp_fill_dataframe(cf, so, ae, 0); /* place consecutive frame N_PCI in appropriate index */ cf->data[ae] = N_PCI_CF | so->tx.sn++; so->tx.sn %= 16; so->tx.bs++; cf->flags = so->ll.tx_flags; skb->dev = dev; can_skb_set_owner(skb, sk); /* cfecho should have been zero'ed by init/isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho); /* set consecutive frame echo tag */ so->cfecho = *(u32 *)cf->data; /* send frame with local echo enabled */ can_send_ret = can_send(skb, 1); if (can_send_ret) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(can_send_ret)); if (can_send_ret == -ENOBUFS) pr_notice_once("can-isotp: tx queue is full\n"); } dev_put(dev); } static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so, int ae) { int i; int ff_pci_sz; cf->can_id = so->txid; cf->len = so->tx.ll_dl; if (ae) cf->data[0] = so->opt.ext_address; /* create N_PCI bytes with 12/32 bit FF_DL data length */ if (so->tx.len > MAX_12BIT_PDU_SIZE) { /* use 32 bit FF_DL notation */ cf->data[ae] = N_PCI_FF; cf->data[ae + 1] = 0; cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU; cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU; cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU; cf->data[ae + 5] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ32; } else { /* use 12 bit FF_DL notation */ cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF; cf->data[ae + 1] = (u8)so->tx.len & 0xFFU; ff_pci_sz = FF_PCI_SZ12; } /* add first data bytes depending on ae */ for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++) cf->data[i] = so->tx.buf[so->tx.idx++]; so->tx.sn = 1; } static void isotp_rcv_echo(struct sk_buff *skb, void *data) { struct sock *sk = (struct sock *)data; struct isotp_sock *so = isotp_sk(sk); struct canfd_frame *cf = (struct canfd_frame *)skb->data; /* only handle my own local echo CF/SF skb's (no FF!) */ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data) return; /* cancel local echo timeout */ hrtimer_cancel(&so->txtimer); /* local echo skb with consecutive frame has been consumed */ so->cfecho = 0; if (so->tx.idx >= so->tx.len) { /* we are done */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return; } if (so->txfc.bs && so->tx.bs >= so->txfc.bs) { /* stop and wait for FC with timeout */ so->tx.state = ISOTP_WAIT_FC; hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); return; } /* no gap between data frames needed => use burst mode */ if (!so->tx_gap) { /* enable echo timeout handling */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); isotp_send_cframe(so); return; } /* start timer to send next consecutive frame with correct delay */ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT); } static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txtimer); struct sock *sk = &so->sk; /* don't handle timeouts in IDLE or SHUTDOWN state */ if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN) return HRTIMER_NORESTART; /* we did not get any flow control or echo frame in time */ /* report 'communication error on send' */ sk->sk_err = ECOMM; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); /* reset tx state */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return HRTIMER_NORESTART; } static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer) { struct isotp_sock *so = container_of(hrtimer, struct isotp_sock, txfrtimer); /* start echo timeout handling and cover below protocol error */ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0), HRTIMER_MODE_REL_SOFT); /* cfecho should be consumed by isotp_rcv_echo() here */ if (so->tx.state == ISOTP_SENDING && !so->cfecho) isotp_send_cframe(so); return HRTIMER_NORESTART; } static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0; int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0; s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT; int off; int err; if (!so->bound || so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; wait_free_buffer: /* we do not support multiple buffers - for now */ if (wq_has_sleeper(&so->wait) && (msg->msg_flags & MSG_DONTWAIT)) return -EAGAIN; /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { if (so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; goto wait_free_buffer; } /* PDU size > default => try max_pdu_size */ if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) { u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL); if (newbuf) { so->tx.buf = newbuf; so->tx.buflen = max_pdu_size; } } if (!size || size > so->tx.buflen) { err = -EINVAL; goto err_out_drop; } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) && (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { err = -EINVAL; goto err_out_drop; } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) goto err_out_drop; dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { err = -ENXIO; goto err_out_drop; } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); goto err_out_drop; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; so->tx.len = size; so->tx.idx = 0; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */ if (so->cfecho) pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho); /* check for single frame transmission depending on TX_DL */ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) { /* The message size generally fits into a SingleFrame - good. * * SF_DL ESC offset optimization: * * When TX_DL is greater 8 but the message would still fit * into a 8 byte CAN frame, we can omit the offset. * This prevents a protocol caused length extension from * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling. */ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae) off = 0; isotp_fill_dataframe(cf, so, ae, off); /* place single frame N_PCI w/o length in appropriate index */ cf->data[ae] = N_PCI_SF; /* place SF_DL size value depending on the SF_DL ESC offset */ if (off) cf->data[SF_PCI_SZ4 + ae] = size; else cf->data[ae] |= size; /* set CF echo tag for isotp_rcv_echo() (SF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* send first frame */ isotp_create_fframe(cf, so, ae); if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) { /* set timer for FC-less operation (STmin = 0) */ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN) so->tx_gap = ktime_set(0, so->force_tx_stmin); else so->tx_gap = ktime_set(0, so->frame_txtime); /* disable wait for FCs due to activated block size */ so->txfc.bs = 0; /* set CF echo tag for isotp_rcv_echo() (CF-mode) */ so->cfecho = *(u32 *)cf->data; } else { /* standard flow control check */ so->tx.state = ISOTP_WAIT_FIRST_FC; /* start timeout for FC */ hrtimer_sec = ISOTP_FC_TIMEOUT; /* no CF echo tag for isotp_rcv_echo() (FF-mode) */ so->cfecho = 0; } } hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0), HRTIMER_MODE_REL_SOFT); /* send the first or only CAN frame */ cf->flags = so->ll.tx_flags; skb->dev = dev; skb->sk = sk; err = can_send(skb, 1); dev_put(dev); if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); /* no transmission -> no timeout monitoring */ hrtimer_cancel(&so->txtimer); /* reset consecutive frame echo tag */ so->cfecho = 0; goto err_out_drop; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); if (err) goto err_event_drop; err = sock_error(sk); if (err) return err; } return size; err_event_drop: /* got signal: force tx state machine to be idle */ so->tx.state = ISOTP_IDLE; hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); err_out_drop: /* drop this PDU and unlock a potential wait queue */ so->tx.state = ISOTP_IDLE; wake_up_interruptible(&so->wait); return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT)) return -EINVAL; if (!so->bound) return -EADDRNOTAVAIL; skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) goto out_err; sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(ISOTP_MIN_NAMELEN); msg->msg_namelen = ISOTP_MIN_NAMELEN; memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } /* set length of return value */ ret = (flags & MSG_TRUNC) ? skb->len : size; out_err: skb_free_datagram(sk, skb); return ret; } static int isotp_release(struct socket *sock) { struct sock *sk = sock->sk; struct isotp_sock *so; struct net *net; if (!sk) return 0; so = isotp_sk(sk); net = sock_net(sk); /* wait for complete transmission of current pdu */ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 && cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE) ; /* force state machines to be idle also when a signal occurred */ so->tx.state = ISOTP_SHUTDOWN; so->rx.state = ISOTP_IDLE; spin_lock(&isotp_notifier_lock); while (isotp_busy_notifier == so) { spin_unlock(&isotp_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&isotp_notifier_lock); } list_del(&so->notifier); spin_unlock(&isotp_notifier_lock); lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (so->ifindex) { struct net_device *dev; dev = dev_get_by_index(net, so->ifindex); if (dev) { if (isotp_register_rxid(so)) can_rx_unregister(net, dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(net, dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); dev_put(dev); synchronize_rcu(); } } } hrtimer_cancel(&so->txfrtimer); hrtimer_cancel(&so->txtimer); hrtimer_cancel(&so->rxtimer); so->ifindex = 0; so->bound = 0; if (so->rx.buf != so->rx.sbuf) kfree(so->rx.buf); if (so->tx.buf != so->tx.sbuf) kfree(so->tx.buf); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct net *net = sock_net(sk); int ifindex; struct net_device *dev; canid_t tx_id = addr->can_addr.tp.tx_id; canid_t rx_id = addr->can_addr.tp.rx_id; int err = 0; int notify_enetdown = 0; if (len < ISOTP_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* sanitize tx CAN identifier */ if (tx_id & CAN_EFF_FLAG) tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else tx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (tx_id != addr->can_addr.tp.tx_id) return -EINVAL; /* sanitize rx CAN identifier (if needed) */ if (isotp_register_rxid(so)) { if (rx_id & CAN_EFF_FLAG) rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK); else rx_id &= CAN_SFF_MASK; /* give feedback on wrong CAN-ID value */ if (rx_id != addr->can_addr.tp.rx_id) return -EINVAL; } if (!addr->can_ifindex) return -ENODEV; lock_sock(sk); if (so->bound) { err = -EINVAL; goto out; } /* ensure different CAN IDs when the rx_id is to be registered */ if (isotp_register_rxid(so) && rx_id == tx_id) { err = -EADDRNOTAVAIL; goto out; } dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_CAN) { dev_put(dev); err = -ENODEV; goto out; } if (dev->mtu < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; } if (!(dev->flags & IFF_UP)) notify_enetdown = 1; ifindex = dev->ifindex; if (isotp_register_rxid(so)) can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id), isotp_rcv, sk, "isotp", sk); /* no consecutive frame echo skb in flight */ so->cfecho = 0; /* register for echo skb's */ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id), isotp_rcv_echo, sk, "isotpe", sk); dev_put(dev); /* switch to new settings */ so->ifindex = ifindex; so->rxid = rx_id; so->txid = tx_id; so->bound = 1; out: release_sock(sk); if (notify_enetdown) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } return err; } static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); if (peer) return -EOPNOTSUPP; memset(addr, 0, ISOTP_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = so->ifindex; addr->can_addr.tp.rx_id = so->rxid; addr->can_addr.tp.tx_id = so->txid; return ISOTP_MIN_NAMELEN; } static int isotp_setsockopt_locked(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int ret = 0; if (so->bound) return -EISCONN; switch (optname) { case CAN_ISOTP_OPTS: if (optlen != sizeof(struct can_isotp_options)) return -EINVAL; if (copy_from_sockptr(&so->opt, optval, optlen)) return -EFAULT; /* no separate rx_ext_address is given => use ext_address */ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR)) so->opt.rx_ext_address = so->opt.ext_address; /* these broadcast flags are not allowed together */ if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) { /* CAN_ISOTP_SF_BROADCAST is prioritized */ so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST; /* give user feedback on wrong config attempt */ ret = -EINVAL; } /* check for frame_txtime changes (0 => no changes) */ if (so->opt.frame_txtime) { if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO) so->frame_txtime = 0; else so->frame_txtime = so->opt.frame_txtime; } break; case CAN_ISOTP_RECV_FC: if (optlen != sizeof(struct can_isotp_fc_options)) return -EINVAL; if (copy_from_sockptr(&so->rxfc, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_TX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_RX_STMIN: if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen)) return -EFAULT; break; case CAN_ISOTP_LL_OPTS: if (optlen == sizeof(struct can_isotp_ll_options)) { struct can_isotp_ll_options ll; if (copy_from_sockptr(&ll, optval, optlen)) return -EFAULT; /* check for correct ISO 11898-1 DLC data length */ if (ll.tx_dl != padlen(ll.tx_dl)) return -EINVAL; if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU) return -EINVAL; if (ll.mtu == CAN_MTU && (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0)) return -EINVAL; memcpy(&so->ll, &ll, sizeof(ll)); /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = ll.tx_dl; } else { return -EINVAL; } break; default: ret = -ENOPROTOOPT; } return ret; } static int isotp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int ret; if (level != SOL_CAN_ISOTP) return -EINVAL; lock_sock(sk); ret = isotp_setsockopt_locked(sock, level, optname, optval, optlen); release_sock(sk); return ret; } static int isotp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); int len; void *val; if (level != SOL_CAN_ISOTP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case CAN_ISOTP_OPTS: len = min_t(int, len, sizeof(struct can_isotp_options)); val = &so->opt; break; case CAN_ISOTP_RECV_FC: len = min_t(int, len, sizeof(struct can_isotp_fc_options)); val = &so->rxfc; break; case CAN_ISOTP_TX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_tx_stmin; break; case CAN_ISOTP_RX_STMIN: len = min_t(int, len, sizeof(u32)); val = &so->force_rx_stmin; break; case CAN_ISOTP_LL_OPTS: len = min_t(int, len, sizeof(struct can_isotp_ll_options)); val = &so->ll; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, val, len)) return -EFAULT; return 0; } static void isotp_notify(struct isotp_sock *so, unsigned long msg, struct net_device *dev) { struct sock *sk = &so->sk; if (!net_eq(dev_net(dev), sock_net(sk))) return; if (so->ifindex != dev->ifindex) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove current filters & unregister */ if (so->bound) { if (isotp_register_rxid(so)) can_rx_unregister(dev_net(dev), dev, so->rxid, SINGLE_MASK(so->rxid), isotp_rcv, sk); can_rx_unregister(dev_net(dev), dev, so->txid, SINGLE_MASK(so->txid), isotp_rcv_echo, sk); } so->ifindex = 0; so->bound = 0; release_sock(sk); sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; case NETDEV_DOWN: sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); break; } } static int isotp_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&isotp_notifier_lock); list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) { spin_unlock(&isotp_notifier_lock); isotp_notify(isotp_busy_notifier, msg, dev); spin_lock(&isotp_notifier_lock); } isotp_busy_notifier = NULL; spin_unlock(&isotp_notifier_lock); return NOTIFY_DONE; } static int isotp_init(struct sock *sk) { struct isotp_sock *so = isotp_sk(sk); so->ifindex = 0; so->bound = 0; so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS; so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS; so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT; so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME; so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS; so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN; so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX; so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU; so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL; so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS; /* set ll_dl for tx path to similar place as for rx */ so->tx.ll_dl = so->ll.tx_dl; so->rx.state = ISOTP_IDLE; so->tx.state = ISOTP_IDLE; so->rx.buf = so->rx.sbuf; so->tx.buf = so->tx.sbuf; so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->rxtimer.function = isotp_rx_timer_handler; hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txtimer.function = isotp_tx_timer_handler; hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); so->txfrtimer.function = isotp_txfr_timer_handler; init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); spin_lock(&isotp_notifier_lock); list_add_tail(&so->notifier, &isotp_notifier_list); spin_unlock(&isotp_notifier_lock); return 0; } static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); poll_wait(file, &so->wait, wait); /* Check for false positives due to TX state */ if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE)) mask &= ~(EPOLLOUT | EPOLLWRNORM); return mask; } static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops isotp_ops = { .family = PF_CAN, .release = isotp_release, .bind = isotp_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = isotp_getname, .poll = isotp_poll, .ioctl = isotp_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = isotp_setsockopt, .getsockopt = isotp_getsockopt, .sendmsg = isotp_sendmsg, .recvmsg = isotp_recvmsg, .mmap = sock_no_mmap, }; static struct proto isotp_proto __read_mostly = { .name = "CAN_ISOTP", .owner = THIS_MODULE, .obj_size = sizeof(struct isotp_sock), .init = isotp_init, }; static const struct can_proto isotp_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_ISOTP, .ops = &isotp_ops, .prot = &isotp_proto, }; static struct notifier_block canisotp_notifier = { .notifier_call = isotp_notifier }; static __init int isotp_module_init(void) { int err; max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE); max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE); pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size); err = can_proto_register(&isotp_can_proto); if (err < 0) pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err)); else register_netdevice_notifier(&canisotp_notifier); return err; } static __exit void isotp_module_exit(void) { can_proto_unregister(&isotp_can_proto); unregister_netdevice_notifier(&canisotp_notifier); } module_init(isotp_module_init); module_exit(isotp_module_exit);
94 94 31 116 93 53 2 3 48 93 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_FRAG_H #define _IPV6_FRAG_H #include <linux/icmpv6.h> #include <linux/kernel.h> #include <net/addrconf.h> #include <net/ipv6.h> #include <net/inet_frag.h> enum ip6_defrag_users { IP6_DEFRAG_LOCAL_DELIVER, IP6_DEFRAG_CONNTRACK_IN, __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, IP6_DEFRAG_CONNTRACK_OUT, __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, IP6_DEFRAG_CONNTRACK_BRIDGE_IN, __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, }; /* * Equivalent of ipv4 struct ip */ struct frag_queue { struct inet_frag_queue q; int iif; __u16 nhoffset; u8 ecn; }; #if IS_ENABLED(CONFIG_IPV6) static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) { struct frag_queue *fq = container_of(q, struct frag_queue, q); const struct frag_v6_compare_key *key = a; q->key.v6 = *key; fq->ecn = 0; } static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) { return jhash2(data, sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); } static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) { const struct inet_frag_queue *fq = data; return jhash2((const u32 *)&fq->key.v6, sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); } static inline int ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) { const struct frag_v6_compare_key *key = arg->key; const struct inet_frag_queue *fq = ptr; return !!memcmp(&fq->key, key, sizeof(*key)); } static inline void ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; rcu_read_lock(); /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ if (READ_ONCE(fq->q.fqdir->dead)) goto out_rcu_unlock; spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; fq->q.flags |= INET_FRAG_DROP; inet_frag_kill(&fq->q); dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) goto out; __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ if (!(fq->q.flags & INET_FRAG_FIRST_IN)) goto out; /* sk_buff::dev and sk_buff::rbnode are unionized. So we * pull the head out of the tree in order to be able to * deal with head->dev. */ head = inet_frag_pull_head(&fq->q); if (!head) goto out; head->dev = dev; spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); goto out_rcu_unlock; out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); inet_frag_put(&fq->q); } /* Check if the upper layer header is truncated in the first fragment. */ static inline bool ipv6frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp) { u8 nexthdr = *nexthdrp; __be16 frag_off; int offset; offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off); if (offset < 0 || (frag_off & htons(IP6_OFFSET))) return false; switch (nexthdr) { case NEXTHDR_TCP: offset += sizeof(struct tcphdr); break; case NEXTHDR_UDP: offset += sizeof(struct udphdr); break; case NEXTHDR_ICMP: offset += sizeof(struct icmp6hdr); break; default: offset += 1; } if (offset > skb->len) return true; return false; } #endif #endif
1049 1042 136 136 1761 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 // SPDX-License-Identifier: GPL-2.0-only /* * Monitoring code for network dropped packet alerts * * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <linux/bitfield.h> #include <linux/percpu.h> #include <linux/timer.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/module.h> #include <net/genetlink.h> #include <net/netevent.h> #include <net/flow_offload.h> #include <net/dropreason.h> #include <net/devlink.h> #include <trace/events/skb.h> #include <trace/events/napi.h> #include <trace/events/devlink.h> #include <asm/unaligned.h> #define TRACE_ON 1 #define TRACE_OFF 0 /* * Globals, our netlink socket pointer * and the work handle that will send up * netlink alerts */ static int trace_state = TRACE_OFF; static bool monitor_hw; /* net_dm_mutex * * An overall lock guarding every operation coming from userspace. */ static DEFINE_MUTEX(net_dm_mutex); struct net_dm_stats { u64_stats_t dropped; struct u64_stats_sync syncp; }; #define NET_DM_MAX_HW_TRAP_NAME_LEN 40 struct net_dm_hw_entry { char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN]; u32 count; }; struct net_dm_hw_entries { u32 num_entries; struct net_dm_hw_entry entries[]; }; struct per_cpu_dm_data { spinlock_t lock; /* Protects 'skb', 'hw_entries' and * 'send_timer' */ union { struct sk_buff *skb; struct net_dm_hw_entries *hw_entries; }; struct sk_buff_head drop_queue; struct work_struct dm_alert_work; struct timer_list send_timer; struct net_dm_stats stats; }; struct dm_hw_stat_delta { unsigned long last_rx; unsigned long last_drop_val; struct rcu_head rcu; }; static struct genl_family net_drop_monitor_family; static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data); static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data); static int dm_hit_limit = 64; static int dm_delay = 1; static unsigned long dm_hw_check_delta = 2*HZ; static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY; static u32 net_dm_trunc_len; static u32 net_dm_queue_len = 1000; struct net_dm_alert_ops { void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason); void (*napi_poll_probe)(void *ignore, struct napi_struct *napi, int work, int budget); void (*work_item_func)(struct work_struct *work); void (*hw_work_item_func)(struct work_struct *work); void (*hw_trap_probe)(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata); }; struct net_dm_skb_cb { union { struct devlink_trap_metadata *hw_metadata; void *pc; }; enum skb_drop_reason reason; }; #define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0])) static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data) { size_t al; struct net_dm_alert_msg *msg; struct nlattr *nla; struct sk_buff *skb; unsigned long flags; void *msg_header; al = sizeof(struct net_dm_alert_msg); al += dm_hit_limit * sizeof(struct net_dm_drop_point); al += sizeof(struct nlattr); skb = genlmsg_new(al, GFP_KERNEL); if (!skb) goto err; msg_header = genlmsg_put(skb, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!msg_header) { nlmsg_free(skb); skb = NULL; goto err; } nla = nla_reserve(skb, NLA_UNSPEC, sizeof(struct net_dm_alert_msg)); if (!nla) { nlmsg_free(skb); skb = NULL; goto err; } msg = nla_data(nla); memset(msg, 0, al); goto out; err: mod_timer(&data->send_timer, jiffies + HZ / 10); out: spin_lock_irqsave(&data->lock, flags); swap(data->skb, skb); spin_unlock_irqrestore(&data->lock, flags); if (skb) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; struct genlmsghdr *gnlh = (struct genlmsghdr *)nlmsg_data(nlh); genlmsg_end(skb, genlmsg_data(gnlh)); } return skb; } static const struct genl_multicast_group dropmon_mcgrps[] = { { .name = "events", }, }; static void send_dm_alert(struct work_struct *work) { struct sk_buff *skb; struct per_cpu_dm_data *data; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); skb = reset_per_cpu_data(data); if (skb) genlmsg_multicast(&net_drop_monitor_family, skb, 0, 0, GFP_KERNEL); } /* * This is the timer function to delay the sending of an alert * in the event that more drops will arrive during the * hysteresis period. */ static void sched_send_work(struct timer_list *t) { struct per_cpu_dm_data *data = from_timer(data, t, send_timer); schedule_work(&data->dm_alert_work); } static void trace_drop_common(struct sk_buff *skb, void *location) { struct net_dm_alert_msg *msg; struct net_dm_drop_point *point; struct nlmsghdr *nlh; struct nlattr *nla; int i; struct sk_buff *dskb; struct per_cpu_dm_data *data; unsigned long flags; local_irq_save(flags); data = this_cpu_ptr(&dm_cpu_data); spin_lock(&data->lock); dskb = data->skb; if (!dskb) goto out; nlh = (struct nlmsghdr *)dskb->data; nla = genlmsg_data(nlmsg_data(nlh)); msg = nla_data(nla); point = msg->points; for (i = 0; i < msg->entries; i++) { if (!memcmp(&location, &point->pc, sizeof(void *))) { point->count++; goto out; } point++; } if (msg->entries == dm_hit_limit) goto out; /* * We need to create a new entry */ __nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point)); nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point)); memcpy(point->pc, &location, sizeof(void *)); point->count = 1; msg->entries++; if (!timer_pending(&data->send_timer)) { data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&data->send_timer); } out: spin_unlock_irqrestore(&data->lock, flags); } static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason) { trace_drop_common(skb, location); } static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { struct net_device *dev = napi->dev; struct dm_hw_stat_delta *stat; /* * Don't check napi structures with no associated device */ if (!dev) return; rcu_read_lock(); stat = rcu_dereference(dev->dm_private); if (stat) { /* * only add a note to our monitor buffer if: * 1) its after the last_rx delta * 2) our rx_dropped count has gone up */ if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) && (dev->stats.rx_dropped != stat->last_drop_val)) { trace_drop_common(NULL, NULL); stat->last_drop_val = dev->stats.rx_dropped; stat->last_rx = jiffies; } } rcu_read_unlock(); } static struct net_dm_hw_entries * net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data) { struct net_dm_hw_entries *hw_entries; unsigned long flags; hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit), GFP_KERNEL); if (!hw_entries) { /* If the memory allocation failed, we try to perform another * allocation in 1/10 second. Otherwise, the probe function * will constantly bail out. */ mod_timer(&hw_data->send_timer, jiffies + HZ / 10); } spin_lock_irqsave(&hw_data->lock, flags); swap(hw_data->hw_entries, hw_entries); spin_unlock_irqrestore(&hw_data->lock, flags); return hw_entries; } static int net_dm_hw_entry_put(struct sk_buff *msg, const struct net_dm_hw_entry *hw_entry) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY); if (!attr) return -EMSGSIZE; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_entries_put(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct nlattr *attr; int i; attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES); if (!attr) return -EMSGSIZE; for (i = 0; i < hw_entries->num_entries; i++) { int rc; rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]); if (rc) goto nla_put_failure; } nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_hw_summary_report_fill(struct sk_buff *msg, const struct net_dm_hw_entries *hw_entries) { struct net_dm_alert_msg anc_hdr = { 0 }; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_ALERT); if (!hdr) return -EMSGSIZE; /* We need to put the ancillary header in order not to break user * space. */ if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr)) goto nla_put_failure; rc = net_dm_hw_entries_put(msg, hw_entries); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void net_dm_hw_summary_work(struct work_struct *work) { struct net_dm_hw_entries *hw_entries; struct per_cpu_dm_data *hw_data; struct sk_buff *msg; int rc; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); if (!hw_entries) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_summary_report_fill(msg, hw_entries); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: kfree(hw_entries); } static void net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct net_dm_hw_entries *hw_entries; struct net_dm_hw_entry *hw_entry; struct per_cpu_dm_data *hw_data; unsigned long flags; int i; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; hw_data = this_cpu_ptr(&dm_hw_cpu_data); spin_lock_irqsave(&hw_data->lock, flags); hw_entries = hw_data->hw_entries; if (!hw_entries) goto out; for (i = 0; i < hw_entries->num_entries; i++) { hw_entry = &hw_entries->entries[i]; if (!strncmp(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) { hw_entry->count++; goto out; } } if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit)) goto out; hw_entry = &hw_entries->entries[hw_entries->num_entries]; strscpy(hw_entry->trap_name, metadata->trap_name, NET_DM_MAX_HW_TRAP_NAME_LEN - 1); hw_entry->count = 1; hw_entries->num_entries++; if (!timer_pending(&hw_data->send_timer)) { hw_data->send_timer.expires = jiffies + dm_delay * HZ; add_timer(&hw_data->send_timer); } out: spin_unlock_irqrestore(&hw_data->lock, flags); } static const struct net_dm_alert_ops net_dm_alert_summary_ops = { .kfree_skb_probe = trace_kfree_skb_hit, .napi_poll_probe = trace_napi_poll_hit, .work_item_func = send_dm_alert, .hw_work_item_func = net_dm_hw_summary_work, .hw_trap_probe = net_dm_hw_trap_summary_probe, }; static void net_dm_packet_trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location, enum skb_drop_reason reason) { ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *data; struct net_dm_skb_cb *cb; struct sk_buff *nskb; unsigned long flags; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; cb = NET_DM_SKB_CB(nskb); cb->reason = reason; cb->pc = location; /* Override the timestamp because we care about the time when the * packet was dropped. */ nskb->tstamp = tstamp; data = this_cpu_ptr(&dm_cpu_data); spin_lock_irqsave(&data->drop_queue.lock, flags); if (skb_queue_len(&data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&data->drop_queue.lock, flags); schedule_work(&data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&data->drop_queue.lock, flags); u64_stats_update_begin(&data->stats.syncp); u64_stats_inc(&data->stats.dropped); u64_stats_update_end(&data->stats.syncp); consume_skb(nskb); } static void net_dm_packet_trace_napi_poll_hit(void *ignore, struct napi_struct *napi, int work, int budget) { } static size_t net_dm_in_port_size(void) { /* NET_DM_ATTR_IN_PORT nest */ return nla_total_size(0) + /* NET_DM_ATTR_PORT_NETDEV_IFINDEX */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PORT_NETDEV_NAME */ nla_total_size(IFNAMSIZ + 1); } #define NET_DM_MAX_SYMBOL_LEN 40 #define NET_DM_MAX_REASON_LEN 50 static size_t net_dm_packet_report_size(size_t payload_len) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PC */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_SYMBOL */ nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_REASON */ nla_total_size(NET_DM_MAX_REASON_LEN + 1) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex, const char *name) { struct nlattr *attr; attr = nla_nest_start(msg, NET_DM_ATTR_IN_PORT); if (!attr) return -EMSGSIZE; if (ifindex && nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) goto nla_put_failure; if (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb); const struct drop_reason_list *list = NULL; unsigned int subsys, subsys_reason; char buf[NET_DM_MAX_SYMBOL_LEN]; struct nlattr *attr; void *hdr; int rc; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc, NET_DM_ATTR_PAD)) goto nla_put_failure; rcu_read_lock(); subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK); if (subsys < SKB_DROP_REASON_SUBSYS_NUM) list = rcu_dereference(drop_reasons_by_subsys[subsys]); subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK; if (!list || subsys_reason >= list->n_reasons || !list->reasons[subsys_reason] || strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) { list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]); subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED; } if (nla_put_string(msg, NET_DM_ATTR_REASON, list->reasons[subsys_reason])) { rcu_read_unlock(); goto nla_put_failure; } rcu_read_unlock(); snprintf(buf, sizeof(buf), "%pS", cb->pc); if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf)) goto nla_put_failure; rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL); if (rc) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } #define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO) static void net_dm_packet_report(struct sk_buff *skb) { struct sk_buff *msg; size_t payload_len; int rc; /* Make sure we start copying the packet from the MAC header */ if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); /* Ensure packet fits inside a single netlink attribute */ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL); if (!msg) goto out; rc = net_dm_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: consume_skb(skb); } static void net_dm_packet_work(struct work_struct *work) { struct per_cpu_dm_data *data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&data->drop_queue.lock, flags); skb_queue_splice_tail_init(&data->drop_queue, &list); spin_unlock_irqrestore(&data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_packet_report(skb); } static size_t net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata) { return hw_metadata->fa_cookie ? nla_total_size(hw_metadata->fa_cookie->cookie_len) : 0; } static size_t net_dm_hw_packet_report_size(size_t payload_len, const struct devlink_trap_metadata *hw_metadata) { size_t size; size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize); return NLMSG_ALIGN(size) + /* NET_DM_ATTR_ORIGIN */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_HW_TRAP_GROUP_NAME */ nla_total_size(strlen(hw_metadata->trap_group_name) + 1) + /* NET_DM_ATTR_HW_TRAP_NAME */ nla_total_size(strlen(hw_metadata->trap_name) + 1) + /* NET_DM_ATTR_IN_PORT */ net_dm_in_port_size() + /* NET_DM_ATTR_FLOW_ACTION_COOKIE */ net_dm_flow_action_cookie_size(hw_metadata) + /* NET_DM_ATTR_TIMESTAMP */ nla_total_size(sizeof(u64)) + /* NET_DM_ATTR_ORIG_LEN */ nla_total_size(sizeof(u32)) + /* NET_DM_ATTR_PROTO */ nla_total_size(sizeof(u16)) + /* NET_DM_ATTR_PAYLOAD */ nla_total_size(payload_len); } static int net_dm_hw_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb, size_t payload_len) { struct devlink_trap_metadata *hw_metadata; struct nlattr *attr; void *hdr; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0, NET_DM_CMD_PACKET_ALERT); if (!hdr) return -EMSGSIZE; if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME, hw_metadata->trap_group_name)) goto nla_put_failure; if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_metadata->trap_name)) goto nla_put_failure; if (hw_metadata->input_dev) { struct net_device *dev = hw_metadata->input_dev; int rc; rc = net_dm_packet_report_in_port_put(msg, dev->ifindex, dev->name); if (rc) goto nla_put_failure; } if (hw_metadata->fa_cookie && nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE, hw_metadata->fa_cookie->cookie_len, hw_metadata->fa_cookie->cookie)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP, ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len)) goto nla_put_failure; if (!payload_len) goto out; if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol))) goto nla_put_failure; attr = skb_put(msg, nla_total_size(payload_len)); attr->nla_type = NET_DM_ATTR_PAYLOAD; attr->nla_len = nla_attr_size(payload_len); if (skb_copy_bits(skb, 0, nla_data(attr), payload_len)) goto nla_put_failure; out: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct devlink_trap_metadata * net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) { const struct flow_action_cookie *fa_cookie; struct devlink_trap_metadata *hw_metadata; const char *trap_group_name; const char *trap_name; hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC); if (!hw_metadata) return NULL; trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC); if (!trap_group_name) goto free_hw_metadata; hw_metadata->trap_group_name = trap_group_name; trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC); if (!trap_name) goto free_trap_group; hw_metadata->trap_name = trap_name; if (metadata->fa_cookie) { size_t cookie_size = sizeof(*fa_cookie) + metadata->fa_cookie->cookie_len; fa_cookie = kmemdup(metadata->fa_cookie, cookie_size, GFP_ATOMIC); if (!fa_cookie) goto free_trap_name; hw_metadata->fa_cookie = fa_cookie; } hw_metadata->input_dev = metadata->input_dev; netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker, GFP_ATOMIC); return hw_metadata; free_trap_name: kfree(trap_name); free_trap_group: kfree(trap_group_name); free_hw_metadata: kfree(hw_metadata); return NULL; } static void net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata) { netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); kfree(hw_metadata); } static void net_dm_hw_packet_report(struct sk_buff *skb) { struct devlink_trap_metadata *hw_metadata; struct sk_buff *msg; size_t payload_len; int rc; if (skb->data > skb_mac_header(skb)) skb_push(skb, skb->data - skb_mac_header(skb)); else skb_pull(skb, skb_mac_header(skb) - skb->data); payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE); if (net_dm_trunc_len) payload_len = min_t(size_t, net_dm_trunc_len, payload_len); hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata), GFP_KERNEL); if (!msg) goto out; rc = net_dm_hw_packet_report_fill(msg, skb, payload_len); if (rc) { nlmsg_free(msg); goto out; } genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL); out: net_dm_hw_metadata_free(NET_DM_SKB_CB(skb)->hw_metadata); consume_skb(skb); } static void net_dm_hw_packet_work(struct work_struct *work) { struct per_cpu_dm_data *hw_data; struct sk_buff_head list; struct sk_buff *skb; unsigned long flags; hw_data = container_of(work, struct per_cpu_dm_data, dm_alert_work); __skb_queue_head_init(&list); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); skb_queue_splice_tail_init(&hw_data->drop_queue, &list); spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); while ((skb = __skb_dequeue(&list))) net_dm_hw_packet_report(skb); } static void net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink, struct sk_buff *skb, const struct devlink_trap_metadata *metadata) { struct devlink_trap_metadata *n_hw_metadata; ktime_t tstamp = ktime_get_real(); struct per_cpu_dm_data *hw_data; struct sk_buff *nskb; unsigned long flags; if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL) return; if (!skb_mac_header_was_set(skb)) return; nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return; n_hw_metadata = net_dm_hw_metadata_copy(metadata); if (!n_hw_metadata) goto free; NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata; nskb->tstamp = tstamp; hw_data = this_cpu_ptr(&dm_hw_cpu_data); spin_lock_irqsave(&hw_data->drop_queue.lock, flags); if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len) __skb_queue_tail(&hw_data->drop_queue, nskb); else goto unlock_free; spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); schedule_work(&hw_data->dm_alert_work); return; unlock_free: spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags); u64_stats_update_begin(&hw_data->stats.syncp); u64_stats_inc(&hw_data->stats.dropped); u64_stats_update_end(&hw_data->stats.syncp); net_dm_hw_metadata_free(n_hw_metadata); free: consume_skb(nskb); } static const struct net_dm_alert_ops net_dm_alert_packet_ops = { .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit, .napi_poll_probe = net_dm_packet_trace_napi_poll_hit, .work_item_func = net_dm_packet_work, .hw_work_item_func = net_dm_hw_packet_work, .hw_trap_probe = net_dm_hw_trap_packet_probe, }; static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = { [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops, [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops, }; #if IS_ENABLED(CONFIG_NET_DEVLINK) static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL); } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL); tracepoint_synchronize_unregister(); } #else static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops) { return -EOPNOTSUPP; } static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops) { } #endif static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; if (monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled"); return -EAGAIN; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_hw_entries *hw_entries; INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func); timer_setup(&hw_data->send_timer, sched_send_work, 0); hw_entries = net_dm_hw_reset_per_cpu_data(hw_data); kfree(hw_entries); } rc = net_dm_hw_probe_register(ops); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint"); goto err_module_put; } monitor_hw = true; return 0; err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); return rc; } static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu; if (!monitor_hw) { NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled"); return; } ops = net_dm_alert_ops_arr[net_dm_alert_mode]; monitor_hw = false; net_dm_hw_probe_unregister(ops); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata; net_dm_hw_metadata_free(hw_metadata); consume_skb(skb); } } module_put(THIS_MODULE); } static int net_dm_trace_on_set(struct netlink_ext_ack *extack) { const struct net_dm_alert_ops *ops; int cpu, rc; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; if (!try_module_get(THIS_MODULE)) { NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module"); return -ENODEV; } for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; INIT_WORK(&data->dm_alert_work, ops->work_item_func); timer_setup(&data->send_timer, sched_send_work, 0); /* Allocate a new per-CPU skb for the summary alert message and * free the old one which might contain stale data from * previous tracing. */ skb = reset_per_cpu_data(data); consume_skb(skb); } rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint"); goto err_module_put; } rc = register_trace_napi_poll(ops->napi_poll_probe, NULL); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint"); goto err_unregister_trace; } return 0; err_unregister_trace: unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); err_module_put: for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } module_put(THIS_MODULE); return rc; } static void net_dm_trace_off_set(void) { const struct net_dm_alert_ops *ops; int cpu; ops = net_dm_alert_ops_arr[net_dm_alert_mode]; unregister_trace_napi_poll(ops->napi_poll_probe, NULL); unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL); tracepoint_synchronize_unregister(); /* Make sure we do not send notifications to user space after request * to stop tracing returns. */ for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; del_timer_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); } module_put(THIS_MODULE); } static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack) { int rc = 0; if (state == trace_state) { NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state"); return -EAGAIN; } switch (state) { case TRACE_ON: rc = net_dm_trace_on_set(extack); break; case TRACE_OFF: net_dm_trace_off_set(); break; default: rc = 1; break; } if (!rc) trace_state = state; else rc = -EINPROGRESS; return rc; } static bool net_dm_is_monitoring(void) { return trace_state == TRACE_ON || monitor_hw; } static int net_dm_alert_mode_get_from_info(struct genl_info *info, enum net_dm_alert_mode *p_alert_mode) { u8 val; val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]); switch (val) { case NET_DM_ALERT_MODE_SUMMARY: case NET_DM_ALERT_MODE_PACKET: *p_alert_mode = val; break; default: return -EINVAL; } return 0; } static int net_dm_alert_mode_set(struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; enum net_dm_alert_mode alert_mode; int rc; if (!info->attrs[NET_DM_ATTR_ALERT_MODE]) return 0; rc = net_dm_alert_mode_get_from_info(info, &alert_mode); if (rc) { NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode"); return -EINVAL; } net_dm_alert_mode = alert_mode; return 0; } static void net_dm_trunc_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_TRUNC_LEN]) return; net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]); } static void net_dm_queue_len_set(struct genl_info *info) { if (!info->attrs[NET_DM_ATTR_QUEUE_LEN]) return; net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]); } static int net_dm_cmd_config(struct sk_buff *skb, struct genl_info *info) { struct netlink_ext_ack *extack = info->extack; int rc; if (net_dm_is_monitoring()) { NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring"); return -EBUSY; } rc = net_dm_alert_mode_set(info); if (rc) return rc; net_dm_trunc_len_set(info); net_dm_queue_len_set(info); return 0; } static int net_dm_monitor_start(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { bool sw_set = false; int rc; if (set_sw) { rc = set_all_monitor_traces(TRACE_ON, extack); if (rc) return rc; sw_set = true; } if (set_hw) { rc = net_dm_hw_monitor_start(extack); if (rc) goto err_monitor_hw; } return 0; err_monitor_hw: if (sw_set) set_all_monitor_traces(TRACE_OFF, extack); return rc; } static void net_dm_monitor_stop(bool set_sw, bool set_hw, struct netlink_ext_ack *extack) { if (set_hw) net_dm_hw_monitor_stop(extack); if (set_sw) set_all_monitor_traces(TRACE_OFF, extack); } static int net_dm_cmd_trace(struct sk_buff *skb, struct genl_info *info) { bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS]; bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS]; struct netlink_ext_ack *extack = info->extack; /* To maintain backward compatibility, we start / stop monitoring of * software drops if no flag is specified. */ if (!set_sw && !set_hw) set_sw = true; switch (info->genlhdr->cmd) { case NET_DM_CMD_START: return net_dm_monitor_start(set_sw, set_hw, extack); case NET_DM_CMD_STOP: net_dm_monitor_stop(set_sw, set_hw, extack); return 0; } return -EOPNOTSUPP; } static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW); if (!hdr) return -EMSGSIZE; if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len)) goto nla_put_failure; if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_config_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static void net_dm_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct net_dm_stats *cpu_stats = &data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } } static int net_dm_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_STATS); if (!attr) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static void net_dm_hw_stats_read(struct net_dm_stats *stats) { int cpu; memset(stats, 0, sizeof(*stats)); for_each_possible_cpu(cpu) { struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct net_dm_stats *cpu_stats = &hw_data->stats; unsigned int start; u64 dropped; do { start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } } static int net_dm_hw_stats_put(struct sk_buff *msg) { struct net_dm_stats stats; struct nlattr *attr; net_dm_hw_stats_read(&stats); attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS); if (!attr) return -EMSGSIZE; if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED, u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD)) goto nla_put_failure; nla_nest_end(msg, attr); return 0; nla_put_failure: nla_nest_cancel(msg, attr); return -EMSGSIZE; } static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info) { void *hdr; int rc; hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW); if (!hdr) return -EMSGSIZE; rc = net_dm_stats_put(msg); if (rc) goto nla_put_failure; rc = net_dm_hw_stats_put(msg); if (rc) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; int rc; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rc = net_dm_stats_fill(msg, info); if (rc) goto free_msg; return genlmsg_reply(msg, info); free_msg: nlmsg_free(msg); return rc; } static int dropmon_net_event(struct notifier_block *ev_block, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dm_hw_stat_delta *stat; switch (event) { case NETDEV_REGISTER: if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private))) break; stat = kzalloc(sizeof(*stat), GFP_KERNEL); if (!stat) break; stat->last_rx = jiffies; rcu_assign_pointer(dev->dm_private, stat); break; case NETDEV_UNREGISTER: stat = rtnl_dereference(dev->dm_private); if (stat) { rcu_assign_pointer(dev->dm_private, NULL); kfree_rcu(stat, rcu); } break; } return NOTIFY_DONE; } static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = { [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 }, [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 }, [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 }, [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG }, [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG }, }; static const struct genl_small_ops dropmon_ops[] = { { .cmd = NET_DM_CMD_CONFIG, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_config, .flags = GENL_ADMIN_PERM, }, { .cmd = NET_DM_CMD_START, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, }, { .cmd = NET_DM_CMD_STOP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = net_dm_cmd_trace, }, { .cmd = NET_DM_CMD_CONFIG_GET, .doit = net_dm_cmd_config_get, }, { .cmd = NET_DM_CMD_STATS_GET, .doit = net_dm_cmd_stats_get, }, }; static int net_dm_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_lock(&net_dm_mutex); return 0; } static void net_dm_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_unlock(&net_dm_mutex); } static struct genl_family net_drop_monitor_family __ro_after_init = { .hdrsize = 0, .name = "NET_DM", .version = 2, .maxattr = NET_DM_ATTR_MAX, .policy = net_dm_nl_policy, .pre_doit = net_dm_nl_pre_doit, .post_doit = net_dm_nl_post_doit, .module = THIS_MODULE, .small_ops = dropmon_ops, .n_small_ops = ARRAY_SIZE(dropmon_ops), .resv_start_op = NET_DM_CMD_STATS_GET + 1, .mcgrps = dropmon_mcgrps, .n_mcgrps = ARRAY_SIZE(dropmon_mcgrps), }; static struct notifier_block dropmon_net_notifier = { .notifier_call = dropmon_net_event }; static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data) { spin_lock_init(&data->lock); skb_queue_head_init(&data->drop_queue); u64_stats_init(&data->stats.syncp); } static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data) { WARN_ON(!skb_queue_empty(&data->drop_queue)); } static void net_dm_cpu_data_init(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); __net_dm_cpu_data_init(data); } static void net_dm_cpu_data_fini(int cpu) { struct per_cpu_dm_data *data; data = &per_cpu(dm_cpu_data, cpu); /* At this point, we should have exclusive access * to this struct and can free the skb inside it. */ consume_skb(data->skb); __net_dm_cpu_data_fini(data); } static void net_dm_hw_cpu_data_init(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); __net_dm_cpu_data_init(hw_data); } static void net_dm_hw_cpu_data_fini(int cpu) { struct per_cpu_dm_data *hw_data; hw_data = &per_cpu(dm_hw_cpu_data, cpu); kfree(hw_data->hw_entries); __net_dm_cpu_data_fini(hw_data); } static int __init init_net_drop_monitor(void) { int cpu, rc; pr_info("Initializing network drop monitor service\n"); if (sizeof(void *) > 8) { pr_err("Unable to store program counters on this arch, Drop monitor failed\n"); return -ENOSPC; } rc = genl_register_family(&net_drop_monitor_family); if (rc) { pr_err("Could not create drop monitor netlink family\n"); return rc; } WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); goto out_unreg; } rc = 0; for_each_possible_cpu(cpu) { net_dm_cpu_data_init(cpu); net_dm_hw_cpu_data_init(cpu); } goto out; out_unreg: genl_unregister_family(&net_drop_monitor_family); out: return rc; } static void exit_net_drop_monitor(void) { int cpu; BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); module_exit(exit_net_drop_monitor); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); MODULE_ALIAS_GENL_FAMILY("NET_DM"); MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts");
21 21 21 21 21 21 20 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the core interrupt handling code. Detailed * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> #include <linux/random.h> #include <linux/sched.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <asm/irq_regs.h> #include <trace/events/irq.h> #include "internals.h" #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER void (*handle_arch_irq)(struct pt_regs *) __ro_after_init; #endif /** * handle_bad_irq - handle spurious and unhandled irqs * @desc: description of the interrupt * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ void handle_bad_irq(struct irq_desc *desc) { unsigned int irq = irq_desc_get_irq(desc); print_irq_desc(irq, desc); kstat_incr_irqs_this_cpu(desc); ack_bad_irq(irq); } EXPORT_SYMBOL_GPL(handle_bad_irq); /* * Special, empty irq handler: */ irqreturn_t no_action(int cpl, void *dev_id) { return IRQ_NONE; } EXPORT_SYMBOL_GPL(no_action); static void warn_no_thread(unsigned int irq, struct irqaction *action) { if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) return; printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " "but no thread function available.", irq, action->name); } void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* * In case the thread crashed and was killed we just pretend that * we handled the interrupt. The hardirq handler has disabled the * device interrupt, so no irq storm is lurking. */ if (action->thread->flags & PF_EXITING) return; /* * Wake up the handler thread for this action. If the * RUNTHREAD bit is already set, nothing to do. */ if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) return; /* * It's safe to OR the mask lockless here. We have only two * places which write to threads_oneshot: This code and the * irq thread. * * This code is the hard irq context and can never run on two * cpus in parallel. If it ever does we have more serious * problems than this bitmask. * * The irq threads of this irq which clear their "running" bit * in threads_oneshot are serialized via desc->lock against * each other and they are serialized against this code by * IRQS_INPROGRESS. * * Hard irq handler: * * spin_lock(desc->lock); * desc->state |= IRQS_INPROGRESS; * spin_unlock(desc->lock); * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); * desc->threads_oneshot |= mask; * spin_lock(desc->lock); * desc->state &= ~IRQS_INPROGRESS; * spin_unlock(desc->lock); * * irq thread: * * again: * spin_lock(desc->lock); * if (desc->state & IRQS_INPROGRESS) { * spin_unlock(desc->lock); * while(desc->state & IRQS_INPROGRESS) * cpu_relax(); * goto again; * } * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) * desc->threads_oneshot &= ~mask; * spin_unlock(desc->lock); * * So either the thread waits for us to clear IRQS_INPROGRESS * or we are waiting in the flow handler for desc->lock to be * released before we reach this point. The thread also checks * IRQTF_RUNTHREAD under desc->lock. If set it leaves * threads_oneshot untouched and runs the thread another time. */ desc->threads_oneshot |= action->thread_mask; /* * We increment the threads_active counter in case we wake up * the irq thread. The irq thread decrements the counter when * it returns from the handler or in the exit path and wakes * up waiters which are stuck in synchronize_irq() when the * active count becomes zero. synchronize_irq() is serialized * against this code (hard irq handler) via IRQS_INPROGRESS * like the finalize_oneshot() code. See comment above. */ atomic_inc(&desc->threads_active); wake_up_process(action->thread); } irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int irq = desc->irq_data.irq; struct irqaction *action; record_irq_time(desc); for_each_action_of_desc(desc, action) { irqreturn_t res; /* * If this IRQ would be threaded under force_irqthreads, mark it so. */ if (irq_settings_can_thread(desc) && !(action->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))) lockdep_hardirq_threaded(); trace_irq_handler_entry(irq, action); res = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, res); if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n", irq, action->handler)) local_irq_disable(); switch (res) { case IRQ_WAKE_THREAD: /* * Catch drivers which return WAKE_THREAD but * did not set up a thread function */ if (unlikely(!action->thread_fn)) { warn_no_thread(irq, action); break; } __irq_wake_thread(desc, action); break; default: break; } retval |= res; } return retval; } irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval; retval = __handle_irq_event_percpu(desc); add_interrupt_randomness(desc->irq_data.irq); if (!irq_settings_no_debug(desc)) note_interrupt(desc, retval); return retval; } irqreturn_t handle_irq_event(struct irq_desc *desc) { irqreturn_t ret; desc->istate &= ~IRQS_PENDING; irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); ret = handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); return ret; } #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) { if (handle_arch_irq) return -EBUSY; handle_arch_irq = handle_irq; return 0; } /** * generic_handle_arch_irq - root irq handler for architectures which do no * entry accounting themselves * @regs: Register file coming from the low-level handling code */ asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs) { struct pt_regs *old_regs; irq_enter(); old_regs = set_irq_regs(regs); handle_arch_irq(regs); set_irq_regs(old_regs); irq_exit(); } #endif
1 1 1 1 1 1 1 1 232 232 232 232 232 232 232 232 6559 6560 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 // SPDX-License-Identifier: GPL-2.0-only /* * mm/mmap.c * * Written by obz. * * Address space accounting code <alan@lxorguk.ukuu.org.uk> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/slab.h> #include <linux/backing-dev.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/capability.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/profile.h> #include <linux/export.h> #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> #include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> #define CREATE_TRACE_POINTS #include <trace/events/mmap.h> #include "internal.h" #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { unsigned long vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); if (vma_wants_writenotify(vma, vm_page_prot)) { vm_flags &= ~VM_SHARED; vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags); } /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ WRITE_ONCE(vma->vm_page_prot, vm_page_prot); } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { if (vma->vm_flags & VM_SHARED) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) { struct file *file = vma->vm_file; if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); __remove_shared_vm_struct(vma, file, mapping); i_mmap_unlock_write(mapping); } } /* * Close a vm structure and free it. */ static void remove_vma(struct vm_area_struct *vma, bool unreachable) { might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); if (unreachable) __vm_area_free(vma); else vm_area_free(vma); } static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev(&vmi->mas, min); } /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. * @addr: The address to check * @len: The size of increase. * * Return: 0 on success. */ static int check_brk_limits(unsigned long addr, unsigned long len) { unsigned long mapped_addr; mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); if (IS_ERR_VALUE(mapped_addr)) return mapped_addr; return mlock_future_ok(current->mm, current->mm->def_flags, len) ? 0 : -EAGAIN; } static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate = false; LIST_HEAD(uf); struct vma_iterator vmi; if (mmap_write_lock_killable(mm)) return -EINTR; origbrk = mm->brk; #ifdef CONFIG_COMPAT_BRK /* * CONFIG_COMPAT_BRK can still be overridden by setting * randomize_va_space to 2, which will still cause mm->start_brk * to be arbitrarily shifted */ if (current->brk_randomized) min_brk = mm->start_brk; else min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif if (brk < min_brk) goto out; /* * Check against rlimit here. If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, mm->end_data, mm->start_data)) goto out; newbrk = PAGE_ALIGN(brk); oldbrk = PAGE_ALIGN(mm->brk); if (oldbrk == newbrk) { mm->brk = brk; goto success; } /* Always allow shrinking brk. */ if (brk <= mm->brk) { /* Search one past newbrk */ vma_iter_init(&vmi, mm, newbrk); brkvma = vma_find(&vmi, oldbrk); if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. * do_vma_munmap() will drop the lock on success, so update it * before calling do_vma_munmap(). */ mm->brk = brk; if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true)) goto out; goto success_unlocked; } if (check_brk_limits(oldbrk, newbrk - oldbrk)) goto out; /* * Only check if the next VMA is within the stack_guard_gap of the * expansion area */ vma_iter_init(&vmi, mm, oldbrk); next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; brkvma = vma_prev_limit(&vmi, mm->start_brk); /* Ok, looks good - let it rip. */ if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; mm->brk = brk; if (mm->def_flags & VM_LOCKED) populate = true; success: mmap_write_unlock(mm); success_unlocked: userfaultfd_unmap_complete(mm, &uf); if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; out: mm->brk = origbrk; mmap_write_unlock(mm); return origbrk; } #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mt_validate(&mm->mm_mt); for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; #endif unsigned long vmi_start, vmi_end; bool warn = 0; vmi_start = vma_iter_addr(&vmi); vmi_end = vma_iter_end(&vmi); if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) warn = 1; if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) warn = 1; if (warn) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma); pr_emerg("tree range: %px start %lx end %lx\n", vma, vmi_start, vmi_end - 1); vma_iter_dump_tree(&vmi); } #ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } #endif i++; } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ #define validate_mm(mm) do { } while (0) #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static inline void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); } static inline void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; unsigned long nr_pages = 0; for_each_vma_range(vmi, vma, end) { unsigned long vm_start = max(addr, vma->vm_start); unsigned long vm_end = min(end, vma->vm_end); nr_pages += PHYS_PFN(vm_end - vm_start); } return nr_pages; } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { if (vma->vm_flags & VM_SHARED) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); struct address_space *mapping = NULL; vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); vma_iter_store(&vmi, vma); if (vma->vm_file) { mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); } mm->map_count++; validate_mm(mm); return 0; } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @next: The next vma if it is to be adjusted * @remove: The first vma to be removed * @remove2: The second vma to be removed */ static inline void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vm_area_struct *next, struct vm_area_struct *remove, struct vm_area_struct *remove2) { memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; vp->remove = remove; vp->remove2 = remove2; vp->adj_next = next; if (!vp->anon_vma && next) vp->anon_vma = next->anon_vma; vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static inline void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { init_multi_vma_prep(vp, vma, NULL, NULL, NULL); } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static inline void vma_prepare(struct vma_prepare *vp) { if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); if (vp->adj_next) uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, vp->adj_next->vm_end); i_mmap_lock_write(vp->mapping); if (vp->insert && vp->insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ __vma_link_file(vp->insert, vp->insert->vm_file->f_mapping); } } if (vp->anon_vma) { anon_vma_lock_write(vp->anon_vma); anon_vma_interval_tree_pre_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_pre_update_vma(vp->adj_next); } if (vp->file) { flush_dcache_mmap_lock(vp->mapping); vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); if (vp->adj_next) vma_interval_tree_remove(vp->adj_next, &vp->mapping->i_mmap); } } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static inline void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { if (vp->file) { if (vp->adj_next) vma_interval_tree_insert(vp->adj_next, &vp->mapping->i_mmap); vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); flush_dcache_mmap_unlock(vp->mapping); } if (vp->remove && vp->file) { __remove_shared_vm_struct(vp->remove, vp->file, vp->mapping); if (vp->remove2) __remove_shared_vm_struct(vp->remove2, vp->file, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ vma_iter_store(vmi, vp->insert); mm->map_count++; } if (vp->anon_vma) { anon_vma_interval_tree_post_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_post_update_vma(vp->adj_next); anon_vma_unlock_write(vp->anon_vma); } if (vp->file) { i_mmap_unlock_write(vp->mapping); uprobe_mmap(vp->vma); if (vp->adj_next) uprobe_mmap(vp->adj_next); } if (vp->remove) { again: vma_mark_detached(vp->remove, true); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); fput(vp->file); } if (vp->remove->anon_vma) anon_vma_merge(vp->vma, vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); vm_area_free(vp->remove); /* * In mprotect's case 6 (see comments on vma_merge), * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; vp->remove2 = NULL; goto again; } } if (vp->insert && vp->file) uprobe_mmap(vp->insert); validate_mm(mm); } /* * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA * @src: The source VMA * * Returns: 0 on success. */ static inline int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src) { /* * Easily overlooked: when mprotect shifts the boundary, make sure the * expanding vma has anon_vma set if the shrinking vma had, to cover any * anon pages imported. */ if (src->anon_vma && !dst->anon_vma) { vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; return anon_vma_clone(dst, src); } return 0; } /* * vma_expand - Expand an existing VMA * * @vmi: The vma iterator * @vma: The vma to expand * @start: The start of the vma * @end: The exclusive end of the vma * @pgoff: The page offset of vma * @next: The current of next vma. * * Expand @vma to @start and @end. Can expand off the start and end. Will * expand over @next if it's different from @vma and @end == @next->vm_end. * Checking if the @vma can expand and merge with @next needs to be handled by * the caller. * * Returns: 0 on success */ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *next) { bool remove_next = false; struct vma_prepare vp; vma_start_write(vma); if (next && (vma != next) && (end == next->vm_end)) { int ret; remove_next = true; vma_start_write(next); ret = dup_anon_vma(vma, next); if (ret) return ret; } init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON(next && !vp.remove && next != vma && end > next->vm_start); /* Only handles expanding */ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); /* Note: vma iterator must be pointing to 'start' */ vma_iter_config(vmi, start, end); if (vma_iter_prealloc(vmi, vma)) goto nomem; vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); return 0; nomem: return -ENOMEM; } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { struct vma_prepare vp; WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); if (vma->vm_start < start) vma_iter_config(vmi, vma->vm_start, start); else vma_iter_config(vmi, end, vma->vm_end); if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); vma_iter_clear(vmi); vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; vma_complete(&vp, vmi, vma->vm_mm); return 0; } /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those if the caller indicates * the current vma may be removed as part of the merge. */ static inline bool is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name, bool may_remove_vma) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) return false; if (vma->vm_file != file) return false; if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) return false; return true; } static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, struct anon_vma *anon_vma2, struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from * parents. This can improve scalability caused by anon_vma lock. */ if ((!anon_vma1 || !anon_vma2) && (!vma || list_is_singular(&vma->anon_vma_chain))) return true; return anon_vma1 == anon_vma2; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return true; } return false; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) return true; } return false; } /* * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), * figure out whether that can be merged with its predecessor or its * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when * called for mprotect, it is certain to be already mapped (either at * an offset within prev, or at the start of next), and the flags of * this area are about to be changed to vm_flags - and the no-change * case has already been eliminated. * * The following mprotect cases have to be considered, where **** is * the area passed down from mprotect_fixup, never extending beyond one * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts * at the same address as **** and is of the same or larger span, and * NNNN the next vma after ****: * * **** **** **** * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC * cannot merge might become might become * PPNNNNNNNNNN PPPPPPPPPPCC * mmap, brk or case 4 below case 5 below * mremap move: * **** **** * PPPP NNNN PPPPCCCCNNNN * might become might become * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 * * It is important for case 8 that the vma CCCC overlapping the * region **** is never going to extended over NNNN. Instead NNNN must * be extended in region **** and CCCC must be removed. This way in * all cases where vma_merge succeeds, the moment vma_merge drops the * rmap_locks, the properties of the merged vma will be already * correct for the whole merged range. Some of those properties like * vm_page_prot/vm_flags may be accessed by rmap_walks and they must * be correct for the whole merged range immediately after the * rmap_locks are released. Otherwise if NNNN would be removed and * CCCC would be extended over the NNNN range, remove_migration_ptes * or other rmap walkers (if working on addresses beyond the "end" * parameter) may establish ptes with the wrong permissions of CCCC * instead of the right permissions of NNNN. * * In the code below: * PPPP is represented by *prev * CCCC is represented by *curr or not represented at all (NULL) * NNNN is represented by *next or not represented at all (NULL) * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; struct vma_prepare vp; pgoff_t vma_pgoff; int err = 0; bool merge_prev = false; bool merge_next = false; bool vma_expanded = false; unsigned long vma_start = addr; unsigned long vma_end = end; pgoff_t pglen = (end - addr) >> PAGE_SHIFT; long adj_start = 0; /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. */ if (vm_flags & VM_SPECIAL) return NULL; /* Does the input range span an existing VMA? (cases 5 - 8) */ curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); if (!curr || /* cases 1 - 4 */ end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ next = vma_lookup(mm, end); else next = NULL; /* case 5 */ if (prev) { vma_start = prev->vm_start; vma_pgoff = prev->vm_pgoff; /* Can we merge the predecessor? */ if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { merge_prev = true; vma_prev(vmi); } } /* Can we merge the successor? */ if (next && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx, anon_name)) { merge_next = true; } /* Verify some invariant that must be enforced by the caller. */ VM_WARN_ON(prev && addr <= prev->vm_start); VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); VM_WARN_ON(addr >= end); if (!merge_prev && !merge_next) return NULL; /* Not mergeable. */ if (merge_prev) vma_start_write(prev); res = vma = prev; remove = remove2 = adjust = NULL; /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { vma_start_write(next); remove = next; /* case 1 */ vma_end = next->vm_end; err = dup_anon_vma(prev, next); if (curr) { /* case 6 */ vma_start_write(curr); remove = curr; remove2 = next; if (!next->anon_vma) err = dup_anon_vma(prev, curr); } } else if (merge_prev) { /* case 2 */ if (curr) { vma_start_write(curr); err = dup_anon_vma(prev, curr); if (end == curr->vm_end) { /* case 7 */ remove = curr; } else { /* case 5 */ adjust = curr; adj_start = (end - curr->vm_start); } } } else { /* merge_next */ vma_start_write(next); res = next; if (prev && addr < prev->vm_end) { /* case 4 */ vma_start_write(prev); vma_end = addr; adjust = next; adj_start = -(prev->vm_end - addr); err = dup_anon_vma(next, prev); } else { /* * Note that cases 3 and 8 are the ONLY ones where prev * is permitted to be (but is not necessarily) NULL. */ vma = next; /* case 3 */ vma_start = addr; vma_end = next->vm_end; vma_pgoff = next->vm_pgoff - pglen; if (curr) { /* case 8 */ vma_pgoff = curr->vm_pgoff; vma_start_write(curr); remove = curr; err = dup_anon_vma(next, curr); } } } /* Error in anon_vma clone. */ if (err) return NULL; if (vma_start < vma->vm_start || vma_end > vma->vm_end) vma_expanded = true; if (vma_expanded) { vma_iter_config(vmi, vma_start, vma_end); } else { vma_iter_config(vmi, adjust->vm_start + adj_start, adjust->vm_end); } if (vma_iter_prealloc(vmi, vma)) return NULL; init_multi_vma_prep(&vp, vma, adjust, remove, remove2); VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && vp.anon_vma != adjust->anon_vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); vma->vm_start = vma_start; vma->vm_end = vma_end; vma->vm_pgoff = vma_pgoff; if (vma_expanded) vma_iter_store(vmi, vma); if (adj_start) { adjust->vm_start += adj_start; adjust->vm_pgoff += adj_start >> PAGE_SHIFT; if (adj_start < 0) { WARN_ON(vma_expanded); vma_iter_store(vmi, next); } } vma_complete(&vp, vmi, mm); khugepaged_enter_vma(res, vm_flags); return res; } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } return NULL; } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; /* Try next first. */ next = mas_walk(&mas); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } prev = mas_prev(&mas, 0); VM_BUG_ON_VMA(prev != vma, vma); prev = mas_prev(&mas, 0); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, * or lead to too many vmas hanging off the same anon_vma. * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ return anon_vma; } /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr */ static inline unsigned long round_hint_to_min(unsigned long hint) { hint &= PAGE_MASK; if (((void *)hint != NULL) && (hint < mmap_min_addr)) return PAGE_ALIGN(mmap_min_addr); return hint; } bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; locked_pages += mm->locked_vm; limit_pages = rlimit(RLIMIT_MEMLOCK); limit_pages >>= PAGE_SHIFT; return locked_pages <= limit_pages; } static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) { if (S_ISREG(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISBLK(inode->i_mode)) return MAX_LFS_FILESIZE; if (S_ISSOCK(inode->i_mode)) return MAX_LFS_FILESIZE; /* Special "we do even unsigned file positions" case */ if (file->f_mode & FMODE_UNSIGNED_OFFSET) return 0; /* Yes, random drivers might want more. But I'm tired of buggy drivers */ return ULONG_MAX; } static inline bool file_mmap_ok(struct file *file, struct inode *inode, unsigned long pgoff, unsigned long len) { u64 maxsize = file_mmap_size_max(file, inode); if (maxsize && len > maxsize) return false; maxsize -= len; if (pgoff > maxsize >> PAGE_SHIFT) return false; return true; } /* * The caller must write-lock current->mm->mmap_lock. */ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf) { struct mm_struct *mm = current->mm; int pkey = 0; *populate = 0; if (!len) return -EINVAL; /* * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec * mounted, in which case we dont add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; /* force arch specific MAP_FIXED handling in get_unmapped_area */ if (flags & MAP_FIXED_NOREPLACE) flags |= MAP_FIXED; if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) return -ENOMEM; /* offset overflow? */ if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) return -EOVERFLOW; /* Too many mappings? */ if (mm->map_count > sysctl_max_map_count) return -ENOMEM; /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { if (find_vma_intersection(mm, addr, addr + len)) return -EEXIST; } if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) pkey = 0; } /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; if (!mlock_future_ok(mm, vm_flags, len)) return -EAGAIN; if (file) { struct inode *inode = file_inode(file); unsigned long flags_mask; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags; switch (flags & MAP_TYPE) { case MAP_SHARED: /* * Force use of MAP_SHARED_VALIDATE with non-legacy * flags. E.g. MAP_SYNC is dangerous to use with * MAP_SHARED as you don't know which consistency model * you will get. We silently ignore unsupported flags * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; if (prot & PROT_WRITE) { if (!(file->f_mode & FMODE_WRITE)) return -EACCES; if (IS_SWAPFILE(file->f_mapping->host)) return -ETXTBSY; } /* * Make sure we don't allow writing to an append-only * file.. */ if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE)) return -EACCES; vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; } if (!file->f_op->mmap) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; break; default: return -EINVAL; } } else { switch (flags & MAP_TYPE) { case MAP_SHARED: if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; /* * Ignore pgoff. */ pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: /* * Set pgoff according to addr for anon_vma. */ pgoff = addr >> PAGE_SHIFT; break; default: return -EINVAL; } } /* * Set 'VM_NORESERVE' if we should not account for the * memory use of this mapping. */ if (flags & MAP_NORESERVE) { /* We honor MAP_NORESERVE if allowed to overcommit */ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) vm_flags |= VM_NORESERVE; /* hugetlb applies strict overcommit unless MAP_NORESERVE */ if (file && is_file_hugepages(file)) vm_flags |= VM_NORESERVE; } addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) *populate = len; return addr; } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { struct file *file = NULL; unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) return -EBADF; if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (unlikely(flags & MAP_HUGETLB)) { retval = -EINVAL; goto out_fput; } } else if (flags & MAP_HUGETLB) { struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (!hs) return -EINVAL; len = ALIGN(len, huge_page_size(hs)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); return retval; } SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } #ifdef __ARCH_WANT_SYS_OLD_MMAP struct mmap_arg_struct { unsigned long addr; unsigned long len; unsigned long prot; unsigned long flags; unsigned long fd; unsigned long offset; }; SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) { struct mmap_arg_struct a; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; if (offset_in_page(a.offset)) return -EINVAL; return ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); } static bool vma_is_shared_writable(struct vm_area_struct *vma) { return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == (VM_WRITE | VM_SHARED); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ if (vma->vm_flags & VM_PFNMAP) return false; return vma->vm_file && vma->vm_file->f_mapping && mapping_can_writeback(vma->vm_file->f_mapping); } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { /* Only shared, writable VMAs require dirty tracking. */ if (!vma_is_shared_writable(vma)) return false; /* Does the filesystem need to be notified? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* * Even if the filesystem doesn't indicate a need for writenotify, if it * can writeback, dirty tracking is still required. */ return vma_fs_can_writeback(vma); } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) return 0; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return 1; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return 0; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return 1; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) return 1; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) return 0; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; MA_STATE(mas, &current->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (mas_empty_area(&mas, low_limit, high_limit - 1, length)) return -ENOMEM; gap = mas.index; gap += (info->align_offset - gap) & info->align_mask; tmp = mas_next(&mas, ULONG_MAX); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; mas_reset(&mas); goto retry; } } else { tmp = mas_prev(&mas, 0); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); mas_reset(&mas); goto retry; } } return gap; } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; MA_STATE(mas, &current->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length)) return -ENOMEM; gap = mas.last + 1 - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = mas.last; tmp = mas_next(&mas, ULONG_MAX); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) <= gap_end) { high_limit = vm_start_gap(tmp); mas_reset(&mas); goto retry; } } else { tmp = mas_prev(&mas, 0); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; mas_reset(&mas); goto retry; } } return gap; } /* * Search for an unmapped address range. * * We are looking for a range that: * - does not intersect with any VMA; * - is contained within the [low_limit, high_limit) interval; * - is at least the desired size. * - satisfies (begin_addr & align_mask) == (align_offset & align_mask) */ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { unsigned long addr; if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) addr = unmapped_area_topdown(info); else addr = unmapped_area(info); trace_vm_unmapped_area(addr, info); return addr; } /* Get an address range which is currently unmapped. * For shmat() with addr=0. * * Ugly calling convention alert: * Return value with the low bits set means error value, * ie * if (ret & ~PAGE_MASK) * error = ret; * * This function "knows" that -ENOMEM has the bits set. */ unsigned long generic_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.align_mask = 0; info.align_offset = 0; return vm_unmapped_area(&info); } #ifndef HAVE_ARCH_UNMAPPED_AREA unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return generic_get_unmapped_area(filp, addr, len, pgoff, flags); } #endif /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): */ unsigned long generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; struct vm_unmapped_area_info info; const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); /* requested length too big for entire address space */ if (len > mmap_end - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) return addr; /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma_prev(mm, addr, &prev); if (mmap_end - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma)) && (!prev || addr >= vm_end_gap(prev))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.align_mask = 0; info.align_offset = 0; addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() * allocations. */ if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = mmap_end; addr = vm_unmapped_area(&info); } return addr; } #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags); } #endif unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); unsigned long error = arch_mmap_check(addr, len, flags); if (error) return error; /* Careful about overflows.. */ if (len > TASK_SIZE) return -ENOMEM; get_area = current->mm->get_unmapped_area; if (file) { if (file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; } else if (flags & MAP_SHARED) { /* * mmap_region() will call shmem_zero_setup() to create a file, * so use shmem's get_unmapped_area in case it can be huge. * do_mmap() will clear pgoff, so match alignment. */ pgoff = 0; get_area = shmem_get_unmapped_area; } addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) return addr; if (addr > TASK_SIZE - len) return -ENOMEM; if (offset_in_page(addr)) return -EINVAL; error = security_mmap_addr(addr); return error ? error : addr; } EXPORT_SYMBOL(get_unmapped_area); /** * find_vma_intersection() - Look up the first VMA which intersects the interval * @mm: The process address space. * @start_addr: The inclusive start user address. * @end_addr: The exclusive end user address. * * Returns: The first VMA within the provided range, %NULL otherwise. Assumes * start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { unsigned long index = start_addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check * @addr: The address * * Returns: The VMA associated with addr, or the next VMA. * May return %NULL in the case of no VMA at addr or above. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { unsigned long index = addr; mmap_assert_locked(mm); return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); /** * find_vma_prev() - Find the VMA for a given address, or the next vma and * set %pprev to the previous VMA, if any. * @mm: The mm_struct to check * @addr: The address * @pprev: The pointer to set to the previous VMA * * Note that RCU lock is missing here since the external mmap_lock() is used * instead. * * Returns: The VMA associated with @addr, or the next vma. * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; MA_STATE(mas, &mm->mm_mt, addr, addr); vma = mas_walk(&mas); *pprev = mas_prev(&mas, 0); if (!vma) vma = mas_next(&mas, ULONG_MAX); return vma; } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : vma->vm_end - size; if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; return 0; } #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) /* * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. */ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; MA_STATE(mas, &mm->mm_mt, vma->vm_start, address); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) mas_prev_range(&mas, address); __mas_set_range(&mas, vma->vm_start, address - 1); if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { mas_destroy(&mas); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_lock in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks.